Import runc_1.0.0~rc10+dfsg1.orig.tar.xz
author Dmitry Smirnov <onlyjob@debian.org>
Sun, 26 Jan 2020 09:24:01 +0000 (09:24 +0000)
committer Dmitry Smirnov <onlyjob@debian.org>
Sun, 26 Jan 2020 09:24:01 +0000 (09:24 +0000)
[dgit import orig runc_1.0.0~rc10+dfsg1.orig.tar.xz]

309 files changed:
.gitignore [new file with mode: 0644]
.pullapprove.yml [new file with mode: 0644]
.travis.yml [new file with mode: 0644]
CONTRIBUTING.md [new file with mode: 0644]
Dockerfile [new file with mode: 0644]
LICENSE [new file with mode: 0644]
MAINTAINERS [new file with mode: 0644]
MAINTAINERS_GUIDE.md [new file with mode: 0644]
Makefile [new file with mode: 0644]
NOTICE [new file with mode: 0644]
PRINCIPLES.md [new file with mode: 0644]
README.md [new file with mode: 0644]
SECURITY.md [new file with mode: 0644]
VERSION [new file with mode: 0644]
checkpoint.go [new file with mode: 0644]
contrib/cmd/recvtty/recvtty.go [new file with mode: 0644]
contrib/completions/bash/runc [new file with mode: 0644]
create.go [new file with mode: 0644]
delete.go [new file with mode: 0644]
docs/checkpoint-restore.md [new file with mode: 0644]
docs/terminals.md [new file with mode: 0644]
events.go [new file with mode: 0644]
exec.go [new file with mode: 0644]
init.go [new file with mode: 0644]
kill.go [new file with mode: 0644]
libcontainer/README.md [new file with mode: 0644]
libcontainer/SPEC.md [new file with mode: 0644]
libcontainer/apparmor/apparmor.go [new file with mode: 0644]
libcontainer/apparmor/apparmor_disabled.go [new file with mode: 0644]
libcontainer/capabilities_linux.go [new file with mode: 0644]
libcontainer/cgroups/cgroups.go [new file with mode: 0644]
libcontainer/cgroups/cgroups_test.go [new file with mode: 0644]
libcontainer/cgroups/cgroups_unsupported.go [new file with mode: 0644]
libcontainer/cgroups/ebpf/devicefilter/devicefilter.go [new file with mode: 0644]
libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go [new file with mode: 0644]
libcontainer/cgroups/ebpf/ebpf.go [new file with mode: 0644]
libcontainer/cgroups/fs/apply_raw.go [new file with mode: 0644]
libcontainer/cgroups/fs/apply_raw_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/blkio.go [new file with mode: 0644]
libcontainer/cgroups/fs/blkio_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/cpu.go [new file with mode: 0644]
libcontainer/cgroups/fs/cpu_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/cpuacct.go [new file with mode: 0644]
libcontainer/cgroups/fs/cpuset.go [new file with mode: 0644]
libcontainer/cgroups/fs/cpuset_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/devices.go [new file with mode: 0644]
libcontainer/cgroups/fs/devices_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/freezer.go [new file with mode: 0644]
libcontainer/cgroups/fs/freezer_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/fs_unsupported.go [new file with mode: 0644]
libcontainer/cgroups/fs/hugetlb.go [new file with mode: 0644]
libcontainer/cgroups/fs/hugetlb_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/kmem.go [new file with mode: 0644]
libcontainer/cgroups/fs/kmem_disabled.go [new file with mode: 0644]
libcontainer/cgroups/fs/memory.go [new file with mode: 0644]
libcontainer/cgroups/fs/memory_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/name.go [new file with mode: 0644]
libcontainer/cgroups/fs/net_cls.go [new file with mode: 0644]
libcontainer/cgroups/fs/net_cls_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/net_prio.go [new file with mode: 0644]
libcontainer/cgroups/fs/net_prio_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/perf_event.go [new file with mode: 0644]
libcontainer/cgroups/fs/pids.go [new file with mode: 0644]
libcontainer/cgroups/fs/pids_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/stats_util_test.go [new file with mode: 0644]
libcontainer/cgroups/fs/util_test.go [new file with mode: 0644]
libcontainer/cgroups/fs2/cpu.go [new file with mode: 0644]
libcontainer/cgroups/fs2/cpuset.go [new file with mode: 0644]
libcontainer/cgroups/fs2/defaultpath.go [new file with mode: 0644]
libcontainer/cgroups/fs2/defaultpath_test.go [new file with mode: 0644]
libcontainer/cgroups/fs2/devices.go [new file with mode: 0644]
libcontainer/cgroups/fs2/freezer.go [new file with mode: 0644]
libcontainer/cgroups/fs2/fs2.go [new file with mode: 0644]
libcontainer/cgroups/fs2/io.go [new file with mode: 0644]
libcontainer/cgroups/fs2/memory.go [new file with mode: 0644]
libcontainer/cgroups/fs2/pids.go [new file with mode: 0644]
libcontainer/cgroups/fscommon/fscommon.go [new file with mode: 0644]
libcontainer/cgroups/fscommon/utils.go [new file with mode: 0644]
libcontainer/cgroups/fscommon/utils_test.go [new file with mode: 0644]
libcontainer/cgroups/stats.go [new file with mode: 0644]
libcontainer/cgroups/systemd/apply_nosystemd.go [new file with mode: 0644]
libcontainer/cgroups/systemd/apply_systemd.go [new file with mode: 0644]
libcontainer/cgroups/systemd/unified_hierarchy.go [new file with mode: 0644]
libcontainer/cgroups/utils.go [new file with mode: 0644]
libcontainer/cgroups/utils_test.go [new file with mode: 0644]
libcontainer/configs/blkio_device.go [new file with mode: 0644]
libcontainer/configs/cgroup_linux.go [new file with mode: 0644]
libcontainer/configs/cgroup_unsupported.go [new file with mode: 0644]
libcontainer/configs/config.go [new file with mode: 0644]
libcontainer/configs/config_linux.go [new file with mode: 0644]
libcontainer/configs/config_linux_test.go [new file with mode: 0644]
libcontainer/configs/config_test.go [new file with mode: 0644]
libcontainer/configs/config_windows_test.go [new file with mode: 0644]
libcontainer/configs/device.go [new file with mode: 0644]
libcontainer/configs/device_defaults.go [new file with mode: 0644]
libcontainer/configs/hugepage_limit.go [new file with mode: 0644]
libcontainer/configs/intelrdt.go [new file with mode: 0644]
libcontainer/configs/interface_priority_map.go [new file with mode: 0644]
libcontainer/configs/mount.go [new file with mode: 0644]
libcontainer/configs/namespaces.go [new file with mode: 0644]
libcontainer/configs/namespaces_linux.go [new file with mode: 0644]
libcontainer/configs/namespaces_syscall.go [new file with mode: 0644]
libcontainer/configs/namespaces_syscall_unsupported.go [new file with mode: 0644]
libcontainer/configs/namespaces_unsupported.go [new file with mode: 0644]
libcontainer/configs/network.go [new file with mode: 0644]
libcontainer/configs/validate/rootless.go [new file with mode: 0644]
libcontainer/configs/validate/rootless_test.go [new file with mode: 0644]
libcontainer/configs/validate/validator.go [new file with mode: 0644]
libcontainer/configs/validate/validator_test.go [new file with mode: 0644]
libcontainer/console_linux.go [new file with mode: 0644]
libcontainer/container.go [new file with mode: 0644]
libcontainer/container_linux.go [new file with mode: 0644]
libcontainer/container_linux_test.go [new file with mode: 0644]
libcontainer/criu_opts_linux.go [new file with mode: 0644]
libcontainer/devices/devices.go [new file with mode: 0644]
libcontainer/devices/devices_test.go [new file with mode: 0644]
libcontainer/error.go [new file with mode: 0644]
libcontainer/error_test.go [new file with mode: 0644]
libcontainer/factory.go [new file with mode: 0644]
libcontainer/factory_linux.go [new file with mode: 0644]
libcontainer/factory_linux_test.go [new file with mode: 0644]
libcontainer/generic_error.go [new file with mode: 0644]
libcontainer/generic_error_test.go [new file with mode: 0644]
libcontainer/init_linux.go [new file with mode: 0644]
libcontainer/integration/checkpoint_test.go [new file with mode: 0644]
libcontainer/integration/doc.go [new file with mode: 0644]
libcontainer/integration/exec_test.go [new file with mode: 0644]
libcontainer/integration/execin_test.go [new file with mode: 0644]
libcontainer/integration/init_test.go [new file with mode: 0644]
libcontainer/integration/seccomp_test.go [new file with mode: 0644]
libcontainer/integration/template_test.go [new file with mode: 0644]
libcontainer/integration/utils_test.go [new file with mode: 0644]
libcontainer/intelrdt/intelrdt.go [new file with mode: 0644]
libcontainer/intelrdt/intelrdt_test.go [new file with mode: 0644]
libcontainer/intelrdt/stats.go [new file with mode: 0644]
libcontainer/intelrdt/util_test.go [new file with mode: 0644]
libcontainer/keys/keyctl.go [new file with mode: 0644]
libcontainer/logs/logs.go [new file with mode: 0644]
libcontainer/logs/logs_linux_test.go [new file with mode: 0644]
libcontainer/message_linux.go [new file with mode: 0644]
libcontainer/mount/mount.go [new file with mode: 0644]
libcontainer/mount/mount_linux.go [new file with mode: 0644]
libcontainer/mount/mountinfo.go [new file with mode: 0644]
libcontainer/network_linux.go [new file with mode: 0644]
libcontainer/notify_linux.go [new file with mode: 0644]
libcontainer/notify_linux_test.go [new file with mode: 0644]
libcontainer/nsenter/README.md [new file with mode: 0644]
libcontainer/nsenter/cloned_binary.c [new file with mode: 0644]
libcontainer/nsenter/namespace.h [new file with mode: 0644]
libcontainer/nsenter/nsenter.go [new file with mode: 0644]
libcontainer/nsenter/nsenter_gccgo.go [new file with mode: 0644]
libcontainer/nsenter/nsenter_test.go [new file with mode: 0644]
libcontainer/nsenter/nsenter_unsupported.go [new file with mode: 0644]
libcontainer/nsenter/nsexec.c [new file with mode: 0644]
libcontainer/process.go [new file with mode: 0644]
libcontainer/process_linux.go [new file with mode: 0644]
libcontainer/restored_process.go [new file with mode: 0644]
libcontainer/rootfs_linux.go [new file with mode: 0644]
libcontainer/rootfs_linux_test.go [new file with mode: 0644]
libcontainer/seccomp/config.go [new file with mode: 0644]
libcontainer/seccomp/fixtures/proc_self_status [new file with mode: 0644]
libcontainer/seccomp/seccomp_linux.go [new file with mode: 0644]
libcontainer/seccomp/seccomp_linux_test.go [new file with mode: 0644]
libcontainer/seccomp/seccomp_unsupported.go [new file with mode: 0644]
libcontainer/setns_init_linux.go [new file with mode: 0644]
libcontainer/specconv/example.go [new file with mode: 0644]
libcontainer/specconv/spec_linux.go [new file with mode: 0644]
libcontainer/specconv/spec_linux_test.go [new file with mode: 0644]
libcontainer/stacktrace/capture.go [new file with mode: 0644]
libcontainer/stacktrace/capture_test.go [new file with mode: 0644]
libcontainer/stacktrace/frame.go [new file with mode: 0644]
libcontainer/stacktrace/frame_test.go [new file with mode: 0644]
libcontainer/stacktrace/stacktrace.go [new file with mode: 0644]
libcontainer/standard_init_linux.go [new file with mode: 0644]
libcontainer/state_linux.go [new file with mode: 0644]
libcontainer/state_linux_test.go [new file with mode: 0644]
libcontainer/stats_linux.go [new file with mode: 0644]
libcontainer/sync.go [new file with mode: 0644]
libcontainer/system/linux.go [new file with mode: 0644]
libcontainer/system/linux_test.go [new file with mode: 0644]
libcontainer/system/proc.go [new file with mode: 0644]
libcontainer/system/proc_test.go [new file with mode: 0644]
libcontainer/system/syscall_linux_32.go [new file with mode: 0644]
libcontainer/system/syscall_linux_64.go [new file with mode: 0644]
libcontainer/system/sysconfig.go [new file with mode: 0644]
libcontainer/system/sysconfig_notcgo.go [new file with mode: 0644]
libcontainer/system/unsupported.go [new file with mode: 0644]
libcontainer/system/xattrs_linux.go [new file with mode: 0644]
libcontainer/user/MAINTAINERS [new file with mode: 0644]
libcontainer/user/lookup.go [new file with mode: 0644]
libcontainer/user/lookup_unix.go [new file with mode: 0644]
libcontainer/user/lookup_windows.go [new file with mode: 0644]
libcontainer/user/user.go [new file with mode: 0644]
libcontainer/user/user_test.go [new file with mode: 0644]
libcontainer/utils/cmsg.go [new file with mode: 0644]
libcontainer/utils/utils.go [new file with mode: 0644]
libcontainer/utils/utils_test.go [new file with mode: 0644]
libcontainer/utils/utils_unix.go [new file with mode: 0644]
list.go [new file with mode: 0644]
main.go [new file with mode: 0644]
man/README.md [new file with mode: 0644]
man/md2man-all.sh [new file with mode: 0755]
man/runc-checkpoint.8.md [new file with mode: 0644]
man/runc-create.8.md [new file with mode: 0644]
man/runc-delete.8.md [new file with mode: 0644]
man/runc-events.8.md [new file with mode: 0644]
man/runc-exec.8.md [new file with mode: 0644]
man/runc-kill.8.md [new file with mode: 0644]
man/runc-list.8.md [new file with mode: 0644]
man/runc-pause.8.md [new file with mode: 0644]
man/runc-ps.8.md [new file with mode: 0644]
man/runc-restore.8.md [new file with mode: 0644]
man/runc-resume.8.md [new file with mode: 0644]
man/runc-run.8.md [new file with mode: 0644]
man/runc-spec.8.md [new file with mode: 0644]
man/runc-start.8.md [new file with mode: 0644]
man/runc-state.8.md [new file with mode: 0644]
man/runc-update.8.md [new file with mode: 0644]
man/runc.8.md [new file with mode: 0644]
notify_socket.go [new file with mode: 0644]
pause.go [new file with mode: 0644]
ps.go [new file with mode: 0644]
restore.go [new file with mode: 0644]
rlimit_linux.go [new file with mode: 0644]
rootless_linux.go [new file with mode: 0644]
run.go [new file with mode: 0644]
script/.validate [new file with mode: 0644]
script/check-config.sh [new file with mode: 0755]
script/release.sh [new file with mode: 0755]
script/tmpmount [new file with mode: 0755]
script/validate-c [new file with mode: 0755]
script/validate-gofmt [new file with mode: 0755]
signalmap.go [new file with mode: 0644]
signalmap_mipsx.go [new file with mode: 0644]
signals.go [new file with mode: 0644]
spec.go [new file with mode: 0644]
start.go [new file with mode: 0644]
state.go [new file with mode: 0644]
tests/integration/README.md [new file with mode: 0644]
tests/integration/cgroups.bats [new file with mode: 0644]
tests/integration/checkpoint.bats [new file with mode: 0644]
tests/integration/config.json [new file with mode: 0644]
tests/integration/create.bats [new file with mode: 0644]
tests/integration/debug.bats [new file with mode: 0644]
tests/integration/delete.bats [new file with mode: 0644]
tests/integration/events.bats [new file with mode: 0644]
tests/integration/exec.bats [new file with mode: 0644]
tests/integration/help.bats [new file with mode: 0644]
tests/integration/helpers.bash [new file with mode: 0644]
tests/integration/kill.bats [new file with mode: 0644]
tests/integration/list.bats [new file with mode: 0644]
tests/integration/mask.bats [new file with mode: 0644]
tests/integration/mounts.bats [new file with mode: 0755]
tests/integration/multi-arch.bash [new file with mode: 0644]
tests/integration/pause.bats [new file with mode: 0644]
tests/integration/ps.bats [new file with mode: 0644]
tests/integration/root.bats [new file with mode: 0644]
tests/integration/spec.bats [new file with mode: 0644]
tests/integration/start.bats [new file with mode: 0644]
tests/integration/start_detached.bats [new file with mode: 0644]
tests/integration/start_hello.bats [new file with mode: 0644]
tests/integration/state.bats [new file with mode: 0644]
tests/integration/testdata/hello-world-aarch64.tar [new file with mode: 0644]
tests/integration/testdata/hello-world.tar [new file with mode: 0644]
tests/integration/tty.bats [new file with mode: 0644]
tests/integration/update.bats [new file with mode: 0644]
tests/integration/version.bats [new file with mode: 0644]
tests/rootless.sh [new file with mode: 0755]
tty.go [new file with mode: 0644]
types/events.go [new file with mode: 0644]
update.go [new file with mode: 0644]
utils.go [new file with mode: 0644]
utils_linux.go [new file with mode: 0644]
vendor.conf [new file with mode: 0644]
vendor/github.com/cilium/ebpf/LICENSE [new file with mode: 0644]
vendor/github.com/cilium/ebpf/abi.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/alu.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/alu_string.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/doc.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/func.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/func_string.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/instruction.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/jump.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/jump_string.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/load_store.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/load_store_string.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/opcode.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/opcode_string.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/asm/register.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/collection.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/doc.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/elf_reader.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/feature.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/go.mod [new file with mode: 0644]
vendor/github.com/cilium/ebpf/internal/cpu.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/internal/endian.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/internal/unix/types_linux.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/internal/unix/types_other.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/linker.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/map.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/marshalers.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/prog.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/ptr_32_be.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/ptr_32_le.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/ptr_64.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/readme.md [new file with mode: 0644]
vendor/github.com/cilium/ebpf/syscalls.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/types.go [new file with mode: 0644]
vendor/github.com/cilium/ebpf/types_string.go [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..282e34e
--- /dev/null
@@ -0,0 +1,6 @@
+vendor/pkg
+/runc
+/runc-*
+contrib/cmd/recvtty/recvtty
+man/man8
+release
diff --git a/.pullapprove.yml b/.pullapprove.yml
new file mode 100644 (file)
index 0000000..fc8c5d3
--- /dev/null
@@ -0,0 +1,10 @@
+approve_by_comment: true
+approve_regex: ^LGTM
+reject_regex: ^Rejected
+reset_on_push: true
+author_approval: ignored
+reviewers:
+  teams:
+    - runc-maintainers
+  name: default
+  required: 2
diff --git a/.travis.yml b/.travis.yml
new file mode 100644 (file)
index 0000000..5c2928d
--- /dev/null
@@ -0,0 +1,54 @@
+dist: bionic
+language: go
+go:
+  - 1.11.x
+  - 1.12.x
+  - tip
+
+matrix:
+  include:
+    - go: 1.12.x
+      env:
+        - RUNC_USE_SYSTEMD=1
+      script:
+        - make BUILDTAGS="${BUILDTAGS}" all
+        - sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1
+    - go: 1.12.x
+      env:
+        - VIRTUALBOX_VERSION=6.0
+        - VAGRANT_VERSION=2.2.6
+        - FEDORA_VERSION=31
+      before_install:
+        - cat /proc/cpuinfo
+        - wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami)
+        - wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb
+        - vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config
+        - ssh default sudo dnf install -y podman
+      script:
+        - ssh default sudo podman build -t test /vagrant
+        - ssh default sudo podman run --privileged --cgroupns=private test make localunittest
+  allow_failures:
+    - go: tip
+
+go_import_path: github.com/opencontainers/runc
+
+# `make ci` uses Docker.
+sudo: required
+services:
+  - docker
+
+env:
+  global:
+    - BUILDTAGS="seccomp apparmor selinux ambient"
+
+before_install:
+  - sudo apt-get -qq update
+  - sudo apt-get install -y libseccomp-dev
+  - go get -u golang.org/x/lint/golint
+  - go get -u github.com/vbatts/git-validation
+  - env | grep TRAVIS_
+
+script:
+  - git-validation -run DCO,short-subject -v
+  - make BUILDTAGS="${BUILDTAGS}"
+  - make BUILDTAGS="${BUILDTAGS}" clean ci cross
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644 (file)
index 0000000..3b674cf
--- /dev/null
@@ -0,0 +1,124 @@
+## Contribution Guidelines
+
+### Security issues
+
+If you are reporting a security issue, do not create an issue or file a pull
+request on GitHub. Instead, disclose the issue responsibly by sending an email
+to security@opencontainers.org (which is inhabited only by the maintainers of
+the various OCI projects).
+
+### Pull requests are always welcome
+
+We are always thrilled to receive pull requests, and do our best to
+process them as fast as possible. Not sure if that typo is worth a pull
+request? Do it! We will appreciate it.
+
+If your pull request is not accepted on the first try, don't be
+discouraged! If there's a problem with the implementation, hopefully you
+received feedback on what to improve.
+
+We're trying very hard to keep runc lean and focused. We don't want it
+to do everything for everybody. This means that we might decide against
+incorporating a new feature. However, there might be a way to implement
+that feature *on top of* runc.
+
+
+### Conventions
+
+Fork the repo and make changes on your fork in a feature branch:
+
+- If it's a bugfix branch, name it XXX-something where XXX is the number of the
+  issue
+- If it's a feature branch, create an enhancement issue to announce your
+  intentions, and name it XXX-something where XXX is the number of the issue.
+
+Submit unit tests for your changes.  Go has a great test framework built in; use
+it! Take a look at existing tests for inspiration. Run the full test suite on
+your branch before submitting a pull request.
+
+Update the documentation when creating or modifying features. Test
+your documentation changes for clarity, concision, and correctness, as
+well as a clean documentation build. See ``docs/README.md`` for more
+information on building the docs and how docs get released.
+
+Write clean code. Universally formatted code promotes ease of writing, reading,
+and maintenance. Always run `gofmt -s -w file.go` on each changed file before
+committing your changes. Most editors have plugins that do this automatically.
+
+Pull requests descriptions should be as clear as possible and include a
+reference to all the issues that they address.
+
+Pull requests must not contain commits from other users or branches.
+
+Commit messages must start with a capitalized and short summary (max. 50
+chars) written in the imperative, followed by an optional, more detailed
+explanatory text which is separated from the summary by an empty line.
+
+Code review comments may be added to your pull request. Discuss, then make the
+suggested modifications and push additional commits to your feature branch. Be
+sure to post a comment after pushing. The new commits will show up in the pull
+request automatically, but the reviewers will not be notified unless you
+comment.
+
+Before the pull request is merged, make sure that you squash your commits into
+logical units of work using `git rebase -i` and `git push -f`. After every
+commit the test suite should be passing. Include documentation changes in the
+same commit so that a revert would remove all traces of the feature or fix.
+
+Commits that fix or close an issue should include a reference like `Closes #XXX`
+or `Fixes #XXX`, which will automatically close the issue when merged.
+
+### Sign your work
+
+The sign-off is a simple line at the end of the explanation for the
+patch, which certifies that you wrote it or otherwise have the right to
+pass it on as an open-source patch.  The rules are pretty simple: if you
+can certify the below (from
+[developercertificate.org](http://developercertificate.org/)):
+
+```
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+660 York Street, Suite 102,
+San Francisco, CA 94110 USA
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+    have the right to submit it under the open source license
+    indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+    of my knowledge, is covered under an appropriate open source
+    license and I have the right under that license to submit that
+    work with modifications, whether created in whole or in part
+    by me, under the same open source license (unless I am
+    permitted to submit under a different license), as indicated
+    in the file; or
+
+(c) The contribution was provided directly to me by some other
+    person who certified (a), (b) or (c) and I have not modified
+    it.
+
+(d) I understand and agree that this project and the contribution
+    are public and that a record of the contribution (including all
+    personal information I submit with it, including my sign-off) is
+    maintained indefinitely and may be redistributed consistent with
+    this project or the open source license(s) involved.
+```
+
+then you just add a line to every git commit message:
+
+    Signed-off-by: Joe Smith <joe@gmail.com>
+
+using your real name (sorry, no pseudonyms or anonymous contributions.)
+
+You can add the sign off when creating the git commit via `git commit -s`.
diff --git a/Dockerfile b/Dockerfile
new file mode 100644 (file)
index 0000000..5c65470
--- /dev/null
@@ -0,0 +1,66 @@
+FROM golang:1.12-stretch
+
+RUN dpkg --add-architecture armel \
+    && dpkg --add-architecture armhf \
+    && dpkg --add-architecture arm64 \
+    && dpkg --add-architecture ppc64el \
+    && apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    sudo \
+    gawk \
+    iptables \
+    jq \
+    pkg-config \
+    libaio-dev \
+    libcap-dev \
+    libprotobuf-dev \
+    libprotobuf-c0-dev \
+    libnl-3-dev \
+    libnet-dev \
+    libseccomp2 \
+    libseccomp-dev \
+    protobuf-c-compiler \
+    protobuf-compiler \
+    python-minimal \
+    uidmap \
+    kmod \
+    crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \
+    libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \
+    --no-install-recommends \
+    && apt-get clean
+
+# Add a dummy user for the rootless integration tests. While runC does
+# not require an entry in /etc/passwd to operate, one of the tests uses
+# `git clone` -- and `git clone` does not allow you to clone a
+# repository if the current uid does not have an entry in /etc/passwd.
+RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless
+
+# install bats
+RUN cd /tmp \
+    && git clone https://github.com/sstephenson/bats.git \
+    && cd bats \
+    && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \
+    && ./install.sh /usr/local \
+    && rm -rf /tmp/bats
+
+# install criu
+ENV CRIU_VERSION v3.12
+RUN mkdir -p /usr/src/criu \
+    && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
+    && cd /usr/src/criu \
+    && make install-criu \
+    && rm -rf /usr/src/criu
+
+# setup a playground for us to spawn containers in
+ENV ROOTFS /busybox
+RUN mkdir -p ${ROOTFS}
+
+COPY script/tmpmount /
+WORKDIR /go/src/github.com/opencontainers/runc
+ENTRYPOINT ["/tmpmount"]
+
+ADD . /go/src/github.com/opencontainers/runc
+
+RUN . tests/integration/multi-arch.bash \
+    && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS}
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..2744858
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,191 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   Copyright 2014 Docker, Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/MAINTAINERS b/MAINTAINERS
new file mode 100644 (file)
index 0000000..9fe08d3
--- /dev/null
@@ -0,0 +1,5 @@
+Michael Crosby <michael@docker.com> (@crosbymichael)
+Mrunal Patel <mpatel@redhat.com> (@mrunalp)
+Daniel, Dao Quang Minh <dqminh89@gmail.com> (@dqminh)
+Qiang Huang <h.huangqiang@huawei.com> (@hqhq)
+Aleksa Sarai <asarai@suse.de> (@cyphar)
diff --git a/MAINTAINERS_GUIDE.md b/MAINTAINERS_GUIDE.md
new file mode 100644 (file)
index 0000000..7442103
--- /dev/null
@@ -0,0 +1,120 @@
+## Introduction
+
+Dear maintainer. Thank you for investing the time and energy to help
+make runc as useful as possible. Maintaining a project is difficult,
+sometimes unrewarding work.  Sure, you will get to contribute cool
+features to the project. But most of your time will be spent reviewing,
+cleaning up, documenting, answering questions, justifying design
+decisions - while everyone has all the fun! But remember - the quality
+of the maintainers' work is what distinguishes the good projects from the
+great.  So please be proud of your work, even the unglamorous parts,
+and encourage a culture of appreciation and respect for *every* aspect
+of improving the project - not just the hot new features.
+
+This document is a manual for maintainers old and new. It explains what
+is expected of maintainers, how they should work, and what tools are
+available to them.
+
+This is a living document - if you see something out of date or missing,
+speak up!
+
+## What are a maintainer's responsibilities?
+
+It is every maintainer's responsibility to:
+
+* 1) Expose a clear roadmap for improving their component.
+* 2) Deliver prompt feedback and decisions on pull requests.
+* 3) Be available to anyone with questions, bug reports, criticism etc.
+  on their component. This includes IRC and GitHub issues and pull requests.
+* 4) Make sure their component respects the philosophy, design and
+  roadmap of the project.
+
+## How are decisions made?
+
+Short answer: with pull requests to the runc repository.
+
+runc is an open-source project with an open design philosophy. This
+means that the repository is the source of truth for EVERY aspect of the
+project, including its philosophy, design, roadmap and APIs. *If it's
+part of the project, it's in the repo. It's in the repo, it's part of
+the project.*
+
+As a result, all decisions can be expressed as changes to the
+repository. An implementation change is a change to the source code. An
+API change is a change to the API specification. A philosophy change is
+a change to the philosophy manifesto. And so on.
+
+All decisions affecting runc, big and small, follow the same 3 steps:
+
+* Step 1: Open a pull request. Anyone can do this.
+
+* Step 2: Discuss the pull request. Anyone can do this.
+
+* Step 3: Accept (`LGTM`) or refuse a pull request. The relevant maintainers do 
+this (see below "Who decides what?")
+
+*I'm a maintainer, should I make pull requests too?*
+
+Yes. Nobody should ever push to master directly. All changes should be
+made through a pull request.
+
+## Who decides what?
+
+All decisions are pull requests, and the relevant maintainers make
+decisions by accepting or refusing the pull request. Review and acceptance
+by anyone is denoted by adding a comment in the pull request: `LGTM`.
+However, only currently listed `MAINTAINERS` are counted towards the required
+two LGTMs.
+
+Overall the maintainer system works because of mutual respect across the
+maintainers of the project.  The maintainers trust one another to make decisions
+in the best interests of the project.  Sometimes maintainers can disagree and
+this is part of a healthy project to represent the point of views of various people.
+In the case where maintainers cannot find agreement on a specific change the
+role of a Chief Maintainer comes into play.
+
+The Chief Maintainer for the project is responsible for overall architecture
+of the project to maintain conceptual integrity.  Large decisions and
+architecture changes should be reviewed by the chief maintainer.
+The current chief maintainer for the project is Michael Crosby (@crosbymichael).
+
+Even though the maintainer system is built on trust, if there is a conflict
+with the chief maintainer on a decision, their decision can be challenged
+and brought to the technical oversight board if two-thirds of the
+maintainers vote for an appeal. It is expected that this would be a
+very exceptional event.
+
+
+### How are maintainers added?
+
+The best maintainers have a vested interest in the project.  Maintainers
+are first and foremost contributors that have shown they are committed to
+the long term success of the project.  Contributors wanting to become
+maintainers are expected to be deeply involved in contributing code,
+pull request review, and triage of issues in the project for more than two months.
+
+Just contributing does not make you a maintainer, it is about building trust
+with the current maintainers of the project and being a person that they can
+depend on and trust to make decisions in the best interest of the project.  The
+final vote to add a new maintainer should be approved by over 66% of the current
+maintainers with the chief maintainer having veto power.  In case of a veto,
+conflict resolution rules expressed above apply.  The voting period is
+five business days on the Pull Request to add the new maintainer.
+
+
+### What is expected of maintainers?
+
+Part of a healthy project is to have active maintainers to support the community
+in contributions and perform tasks to keep the project running.  Maintainers are
+expected to be able to respond in a timely manner if their help is required on specific
+issues where they are pinged.  Being a maintainer is a time consuming commitment and should
+not be taken lightly.
+
+When a maintainer is unable to perform the required duties they can be removed with
+a vote by 66% of the current maintainers with the chief maintainer having veto power.
+The voting period is ten business days.  Issues related to a maintainer's performance should
+be discussed with them among the other maintainers so that they are not surprised by
+a pull request removing them.
+
+
+
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..81db9d9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,133 @@
+.PHONY: all shell dbuild man release \
+           localtest localunittest localintegration \
+           test unittest integration \
+           cross localcross
+
+CONTAINER_ENGINE := docker
+GO := go
+
+SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
+PREFIX := $(DESTDIR)/usr/local
+BINDIR := $(PREFIX)/sbin
+GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
+RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
+PROJECT := github.com/opencontainers/runc
+BUILDTAGS ?= seccomp
+COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true)
+COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}")
+
+MAN_DIR := $(CURDIR)/man/man8
+MAN_PAGES = $(shell ls $(MAN_DIR)/*.8)
+MAN_PAGES_BASE = $(notdir $(MAN_PAGES))
+MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/
+
+RELEASE_DIR := $(CURDIR)/release
+
+VERSION := ${shell cat ./VERSION}
+
+SHELL := $(shell command -v bash 2>/dev/null)
+
+.DEFAULT: runc
+
+runc: $(SOURCES)
+       $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc .
+
+all: runc recvtty
+
+recvtty: contrib/cmd/recvtty/recvtty
+
+contrib/cmd/recvtty/recvtty: $(SOURCES)
+       $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+static: $(SOURCES)
+       CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc .
+       CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+release:
+       script/release.sh -r release/$(VERSION) -v $(VERSION)
+
+dbuild: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all
+
+lint:
+       $(GO) vet $(allpackages)
+       $(GO) fmt $(allpackages)
+
+man:
+       man/md2man-all.sh
+
+runcimage:
+       $(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) .
+
+test:
+       make unittest integration rootlessintegration
+
+localtest:
+       make localunittest localintegration localrootlessintegration
+
+unittest: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS}
+
+localunittest: all
+       $(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages)
+
+integration: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH}
+
+localintegration: all
+       bats -t tests/integration${TESTPATH}
+
+rootlessintegration: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration
+
+localrootlessintegration: all
+       tests/rootless.sh
+
+shell: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash
+
+install:
+       install -D -m0755 runc $(BINDIR)/runc
+
+install-bash:
+       install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc
+
+install-man:
+       install -d -m 755 $(MAN_INSTALL_PATH)
+       install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH)
+
+uninstall:
+       rm -f $(BINDIR)/runc
+
+uninstall-bash:
+       rm -f $(PREFIX)/share/bash-completion/completions/runc
+
+uninstall-man:
+       rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE))
+
+clean:
+       rm -f runc runc-*
+       rm -f contrib/cmd/recvtty/recvtty
+       rm -rf $(RELEASE_DIR)
+       rm -rf $(MAN_DIR)
+
+validate:
+       script/validate-gofmt
+       script/validate-c
+       $(GO) vet $(allpackages)
+
+ci: validate test release
+
+cross: runcimage
+       $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross
+
+localcross:
+       CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel .
+       CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf .
+       CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 .
+       CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le .
+
+# memoize allpackages, so that it's executed only once and only if used
+_allpackages = $(shell $(GO) list ./... | grep -v vendor)
+allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
diff --git a/NOTICE b/NOTICE
new file mode 100644 (file)
index 0000000..5c97abc
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,17 @@
+runc
+
+Copyright 2012-2015 Docker, Inc.
+
+This product includes software developed at Docker, Inc. (http://www.docker.com).
+
+The following is courtesy of our legal counsel:
+
+
+Use and transfer of Docker may be subject to certain restrictions by the
+United States and other governments.  
+It is your responsibility to ensure that your use and/or transfer does not
+violate applicable laws. 
+
+For more information, please see http://www.bis.doc.gov
+
+See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
diff --git a/PRINCIPLES.md b/PRINCIPLES.md
new file mode 100644 (file)
index 0000000..fdcc373
--- /dev/null
@@ -0,0 +1,19 @@
+# runc principles
+
+In the design and development of runc and libcontainer we try to follow these principles:
+
+(Work in progress)
+
+* Don't try to replace every tool. Instead, be an ingredient to improve them.
+* Less code is better.
+* Fewer components are better. Do you really need to add one more class?
+* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand.
+* Don't do later what you can do now. "//TODO: refactor" is not acceptable in new code.
+* When hesitating between two options, choose the one that is easier to reverse.
+* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later.
+* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable.
+* The fewer moving parts in a container, the better.
+* Don't merge it unless you document it.
+* Don't document it unless you can keep it up-to-date.
+* Don't merge it unless you test it!
+* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that.
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..a806f27
--- /dev/null
+++ b/README.md
@@ -0,0 +1,280 @@
+# runc
+
+[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
+[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
+[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
+
+## Introduction
+
+`runc` is a CLI tool for spawning and running containers according to the OCI specification.
+
+## Releases
+
+`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository.
+We will try to make sure that `runc` and the OCI specification major versions stay in lockstep.
+This means that `runc` 1.0.0 should implement the 1.0 version of the specification.
+
+You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
+
+Currently, the following features are not considered to be production-ready:
+
+* Support for cgroup v2
+
+## Security
+
+The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
+
+## Building
+
+`runc` currently supports the Linux platform with various architecture support.
+It must be built with Go version 1.6 or higher in order for some features to function properly.
+
+In order to enable seccomp support you will need to install `libseccomp` on your platform.
+> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
+
+Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make.
+
+```bash
+# create a 'github.com/opencontainers' directory in your GOPATH/src
+cd github.com/opencontainers
+git clone https://github.com/opencontainers/runc
+cd runc
+
+make
+sudo make install
+```
+
+You can also use `go get` to install to your `GOPATH`, assuming that you have a `github.com` parent folder already created under `src`:
+
+```bash
+go get github.com/opencontainers/runc
+cd $GOPATH/src/github.com/opencontainers/runc
+make
+sudo make install
+```
+
+`runc` will be installed to `/usr/local/sbin/runc` on your system.
+
+
+#### Build Tags
+
+`runc` supports optional build tags for compiling support of various features.
+To add build tags to the make option the `BUILDTAGS` variable must be set.
+
+```bash
+make BUILDTAGS='seccomp apparmor'
+```
+
+| Build Tag | Feature                            | Dependency  |
+|-----------|------------------------------------|-------------|
+| seccomp   | Syscall filtering                  | libseccomp  |
+| selinux   | selinux process and mount labeling | <none>      |
+| apparmor  | apparmor profile support           | <none>      |
+| ambient   | ambient capability support         | kernel 4.3  |
+| nokmem    | disable kernel memory accounting   | <none>      |
+
+
+### Running the test suite
+
+`runc` currently supports running its test suite via Docker.
+To run the suite just type `make test`.
+
+```bash
+make test
+```
+
+There are additional make targets for running the tests outside of a container but this is not recommended as the tests are written with the expectation that they can write and remove anywhere.
+
+You can run a specific test case by setting the `TESTFLAGS` variable.
+
+```bash
+# make test TESTFLAGS="-run=SomeTestFunction"
+```
+
+You can run a specific integration test by setting the `TESTPATH` variable.
+
+```bash
+# make test TESTPATH="/checkpoint.bats"
+```
+
+You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
+
+```bash
+# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
+```
+
+### Dependencies Management
+
+`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management.
+Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or update
+new dependencies.
+
+## Using runc
+
+### Creating an OCI Bundle
+
+In order to use runc you must have your container in the format of an OCI bundle.
+If you have Docker installed you can use its `export` method to acquire a root filesystem from an existing Docker container.
+
+```bash
+# create the top most bundle directory
+mkdir /mycontainer
+cd /mycontainer
+
+# create the rootfs directory
+mkdir rootfs
+
+# export busybox via Docker into the rootfs directory
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+```
+
+After a root filesystem is populated you just generate a spec in the format of a `config.json` file inside your bundle.
+`runc` provides a `spec` command to generate a base template spec that you are then able to edit.
+To find features and documentation for fields in the spec please refer to the [specs](https://github.com/opencontainers/runtime-spec) repository.
+
+```bash
+runc spec
+```
+
+### Running Containers
+
+Assuming you have an OCI bundle from the previous step you can execute the container in two different ways.
+
+The first way is to use the convenience command `run` that will handle creating, starting, and deleting the container after it exits.
+
+```bash
+# run as root
+cd /mycontainer
+runc run mycontainerid
+```
+
+If you used the unmodified `runc spec` template this should give you a `sh` session inside the container.
+
+The second way to start a container is using the specs lifecycle operations.
+This gives you more power over how the container is created and managed while it is running.
+This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
+
+
+```json
+        "process": {
+                "terminal": false,
+                "user": {
+                        "uid": 0,
+                        "gid": 0
+                },
+                "args": [
+                        "sleep", "5"
+                ],
+                "env": [
+                        "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                        "TERM=xterm"
+                ],
+                "cwd": "/",
+                "capabilities": {
+                        "bounding": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "effective": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "inheritable": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "permitted": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "ambient": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ]
+                },
+                "rlimits": [
+                        {
+                                "type": "RLIMIT_NOFILE",
+                                "hard": 1024,
+                                "soft": 1024
+                        }
+                ],
+                "noNewPrivileges": true
+        },
+```
+
+Now we can go through the lifecycle operations in your shell.
+
+
+```bash
+# run as root
+cd /mycontainer
+runc create mycontainerid
+
+# view the container is created and in the "created" state
+runc list
+
+# start the process inside the container
+runc start mycontainerid
+
+# after 5 seconds view that the container has exited and is now in the stopped state
+runc list
+
+# now delete the container
+runc delete mycontainerid
+```
+
+This allows higher level systems to augment the container's creation logic with setup of various settings after the container is created and/or before it is deleted. For example, the container's network stack is commonly set up after `create` but before `start`.
+
+#### Rootless containers
+`runc` has the ability to run containers without root privileges. This is called `rootless`. You need to pass some parameters to `runc` in order to run rootless containers. See below and compare with the previous version.
+
+**Note:** In order to use this feature, "User Namespaces" must be compiled and enabled in your kernel. There are various ways to do this depending on your distribution:
+- Confirm `CONFIG_USER_NS=y` is set in your kernel configuration (normally found in `/proc/config.gz`)
+- Arch/Debian: `echo 1 > /proc/sys/kernel/unprivileged_userns_clone`
+- RHEL/CentOS 7: `echo 28633 > /proc/sys/user/max_user_namespaces`
+
+Run the following commands as an ordinary user:
+```bash
+# Same as the first example
+mkdir ~/mycontainer
+cd ~/mycontainer
+mkdir rootfs
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+
+# The --rootless parameter instructs runc spec to generate a configuration for a rootless container, which will allow you to run the container as a non-root user.
+runc spec --rootless
+
+# The --root parameter tells runc where to store the container state. It must be writable by the user.
+runc --root /tmp/runc run mycontainerid
+```
+
+#### Supervisors
+
+`runc` can be used with process supervisors and init systems to ensure that containers are restarted when they exit.
+An example systemd unit file looks something like this.
+
+```systemd
+[Unit]
+Description=Start My Container
+
+[Service]
+Type=forking
+ExecStart=/usr/local/sbin/runc run -d --pid-file /run/mycontainerid.pid mycontainerid
+ExecStopPost=/usr/local/sbin/runc delete mycontainerid
+WorkingDirectory=/mycontainer
+PIDFile=/run/mycontainerid.pid
+
+[Install]
+WantedBy=multi-user.target
+```
+
+## License
+
+The code and docs are released under the [Apache 2.0 license](LICENSE).
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644 (file)
index 0000000..63a7438
--- /dev/null
@@ -0,0 +1,3 @@
+# Security
+
+The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
diff --git a/VERSION b/VERSION
new file mode 100644 (file)
index 0000000..950f8ca
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.0.0-rc10
diff --git a/checkpoint.go b/checkpoint.go
new file mode 100644 (file)
index 0000000..ae01ea3
--- /dev/null
@@ -0,0 +1,137 @@
+// +build linux
+
+package main
+
+import (
+       "fmt"
+       "os"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/runtime-spec/specs-go"
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+
+       "golang.org/x/sys/unix"
+)
+
+var checkpointCommand = cli.Command{
+       Name:  "checkpoint",
+       Usage: "checkpoint a running container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+checkpointed.`,
+       Description: `The checkpoint command saves the state of the container instance.`,
+       Flags: []cli.Flag{
+               cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"},
+               cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"},
+               cli.StringFlag{Name: "parent-path", Value: "", Usage: "path for previous criu image files in pre-dump"},
+               cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"},
+               cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"},
+               cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
+               cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
+               cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
+               cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
+               cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
+               cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
+               cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
+               cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"},
+               cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"},
+               cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"},
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               // XXX: Currently this is untested with rootless containers.
+               if os.Geteuid() != 0 || system.RunningInUserNS() {
+                       logrus.Warn("runc checkpoint is untested with rootless containers")
+               }
+
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               status, err := container.Status()
+               if err != nil {
+                       return err
+               }
+               if status == libcontainer.Created || status == libcontainer.Stopped {
+                       fatalf("Container cannot be checkpointed in %s state", status.String())
+               }
+               defer destroy(container)
+               options := criuOptions(context)
+               // these are the mandatory criu options for a container
+               setPageServer(context, options)
+               setManageCgroupsMode(context, options)
+               if err := setEmptyNsMask(context, options); err != nil {
+                       return err
+               }
+               return container.Checkpoint(options)
+       },
+}
+
+func getCheckpointImagePath(context *cli.Context) string {
+       imagePath := context.String("image-path")
+       if imagePath == "" {
+               imagePath = getDefaultImagePath(context)
+       }
+       return imagePath
+}
+
+func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) {
+       // The following CRIU options are optional: the dump image may be
+       // sent to a remote CRIU page server instead of only the image path.
+       if psOpt := context.String("page-server"); psOpt != "" {
+               addressPort := strings.Split(psOpt, ":")
+               if len(addressPort) != 2 {
+                       fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server"))
+               }
+               portInt, err := strconv.Atoi(addressPort[1])
+               if err != nil {
+                       fatal(fmt.Errorf("Invalid port number"))
+               }
+               options.PageServer = libcontainer.CriuPageServerInfo{
+                       Address: addressPort[0],
+                       Port:    int32(portInt),
+               }
+       }
+}
+
+func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) {
+       if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" {
+               switch cgOpt {
+               case "soft":
+                       options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT
+               case "full":
+                       options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL
+               case "strict":
+                       options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT
+               default:
+                       fatal(fmt.Errorf("Invalid manage cgroups mode"))
+               }
+       }
+}
+
+var namespaceMapping = map[specs.LinuxNamespaceType]int{
+       specs.NetworkNamespace: unix.CLONE_NEWNET,
+}
+
+func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error {
+       /* Runc doesn't manage network devices and their configuration */
+       nsmask := unix.CLONE_NEWNET
+
+       for _, ns := range context.StringSlice("empty-ns") {
+               f, exists := namespaceMapping[specs.LinuxNamespaceType(ns)]
+               if !exists {
+                       return fmt.Errorf("namespace %q is not supported", ns)
+               }
+               nsmask |= f
+       }
+
+       options.EmptyNs = uint32(nsmask)
+       return nil
+}
diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go
new file mode 100644 (file)
index 0000000..a658b8d
--- /dev/null
@@ -0,0 +1,238 @@
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net"
+       "os"
+       "strings"
+
+       "github.com/containerd/console"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/urfave/cli"
+)
+
+// version will be populated by the Makefile, read from
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile
+var gitCommit = ""
+
+const (
+       usage = `Open Container Initiative contrib/cmd/recvtty
+
+recvtty is a reference implementation of a consumer of runC's --console-socket
+API. It has two main modes of operation:
+
+  * single: Only permit one terminal to be sent to the socket, which is
+       then hooked up to the stdio of the recvtty process. This is useful
+       for rudimentary shell management of a container.
+
+  * null: Permit as many terminals to be sent to the socket, but they
+       are read to /dev/null. This is used for testing, and imitates the
+       old runC API's --console=/dev/pts/ptmx hack which would allow for a
+       similar trick. This is probably not what you want to use, unless
+       you're doing something like our bats integration tests.
+
+To use recvtty, just specify a socket path at which you want to receive
+terminals:
+
+    $ recvtty [--mode <single|null>] socket.sock
+`
+)
+
+func bail(err error) {
+       fmt.Fprintf(os.Stderr, "[recvtty] fatal error: %v\n", err)
+       os.Exit(1)
+}
+
+func handleSingle(path string) error {
+       // Open a socket.
+       ln, err := net.Listen("unix", path)
+       if err != nil {
+               return err
+       }
+       defer ln.Close()
+
+       // We only accept a single connection, since we can only really have
+       // one reader for os.Stdin. Plus this is all a PoC.
+       conn, err := ln.Accept()
+       if err != nil {
+               return err
+       }
+       defer conn.Close()
+
+       // Close ln, to allow for other instances to take over.
+       ln.Close()
+
+       // Get the fd of the connection.
+       unixconn, ok := conn.(*net.UnixConn)
+       if !ok {
+               return fmt.Errorf("failed to cast to unixconn")
+       }
+
+       socket, err := unixconn.File()
+       if err != nil {
+               return err
+       }
+       defer socket.Close()
+
+       // Get the master file descriptor from runC.
+       master, err := utils.RecvFd(socket)
+       if err != nil {
+               return err
+       }
+       c, err := console.ConsoleFromFile(master)
+       if err != nil {
+               return err
+       }
+       console.ClearONLCR(c.Fd())
+
+       // Copy from our stdio to the master fd.
+       quitChan := make(chan struct{})
+       go func() {
+               io.Copy(os.Stdout, c)
+               quitChan <- struct{}{}
+       }()
+       go func() {
+               io.Copy(c, os.Stdin)
+               quitChan <- struct{}{}
+       }()
+
+       // Only close the master fd once we've stopped copying.
+       <-quitChan
+       c.Close()
+       return nil
+}
+
+func handleNull(path string) error {
+       // Open a socket.
+       ln, err := net.Listen("unix", path)
+       if err != nil {
+               return err
+       }
+       defer ln.Close()
+
+       // As opposed to handleSingle we accept as many connections as we get, but
+       // we don't interact with Stdin at all (and we copy stdout to /dev/null).
+       for {
+               conn, err := ln.Accept()
+               if err != nil {
+                       return err
+               }
+               go func(conn net.Conn) {
+                       // Don't leave references lying around.
+                       defer conn.Close()
+
+                       // Get the fd of the connection.
+                       unixconn, ok := conn.(*net.UnixConn)
+                       if !ok {
+                               return
+                       }
+
+                       socket, err := unixconn.File()
+                       if err != nil {
+                               return
+                       }
+                       defer socket.Close()
+
+                       // Get the master file descriptor from runC.
+                       master, err := utils.RecvFd(socket)
+                       if err != nil {
+                               return
+                       }
+
+                       // Just do a dumb copy to /dev/null.
+                       devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
+                       if err != nil {
+                               // TODO: Handle this nicely.
+                               return
+                       }
+
+                       io.Copy(devnull, master)
+                       devnull.Close()
+               }(conn)
+       }
+}
+
+func main() {
+       app := cli.NewApp()
+       app.Name = "recvtty"
+       app.Usage = usage
+
+       // Set version to be the same as runC.
+       var v []string
+       if version != "" {
+               v = append(v, version)
+       }
+       if gitCommit != "" {
+               v = append(v, fmt.Sprintf("commit: %s", gitCommit))
+       }
+       app.Version = strings.Join(v, "\n")
+
+       // Set the flags.
+       app.Flags = []cli.Flag{
+               cli.StringFlag{
+                       Name:  "mode, m",
+                       Value: "single",
+                       Usage: "Mode of operation (single or null)",
+               },
+               cli.StringFlag{
+                       Name:  "pid-file",
+                       Value: "",
+                       Usage: "Path to write daemon process ID to",
+               },
+       }
+
+       app.Action = func(ctx *cli.Context) error {
+               args := ctx.Args()
+               if len(args) != 1 {
+                       return fmt.Errorf("need to specify a single socket path")
+               }
+               path := ctx.Args()[0]
+
+               pidPath := ctx.String("pid-file")
+               if pidPath != "" {
+                       pid := fmt.Sprintf("%d\n", os.Getpid())
+                       if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil {
+                               return err
+                       }
+               }
+
+               switch ctx.String("mode") {
+               case "single":
+                       if err := handleSingle(path); err != nil {
+                               return err
+                       }
+               case "null":
+                       if err := handleNull(path); err != nil {
+                               return err
+                       }
+               default:
+                       return fmt.Errorf("need to select a valid mode: %s", ctx.String("mode"))
+               }
+               return nil
+       }
+       if err := app.Run(os.Args); err != nil {
+               bail(err)
+       }
+}
diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc
new file mode 100644 (file)
index 0000000..9517a5b
--- /dev/null
@@ -0,0 +1,826 @@
+#!/bin/bash
+#
+# bash completion file for runc command
+#
+# This script provides completion of:
+#  - commands and their options
+#  - filepaths
+#
+# To enable the completions either:
+#  - place this file in /usr/share/bash-completion/completions
+#  or
+#  - copy this file to e.g. ~/.runc-completion.sh and add the line
+#    below to your .bashrc after bash completion features are loaded
+#    . ~/.runc-completion.sh
+#
+# Configuration:
+#
+
+# Note for developers:
+# Please arrange options sorted alphabetically by long name with the short
+# options immediately following their corresponding long form.
+# This order should be applied to lists, alternatives and code blocks.
+
+__runc_previous_extglob_setting=$(shopt -p extglob)
+shopt -s extglob
+
+__runc_list_all() {
+       COMPREPLY=($(compgen -W "$(runc list -q)" -- $cur))
+}
+
+__runc_pos_first_nonflag() {
+       local argument_flags=$1
+
+       local counter=$((${subcommand_pos:-${command_pos}} + 1))
+       while [ $counter -le $cword ]; do
+               if [ -n "$argument_flags" ] && eval "case '${words[$counter]}' in $argument_flags) true ;; *) false ;; esac"; then
+                       ((counter++))
+               else
+                       case "${words[$counter]}" in
+                       -*) ;;
+                       *)
+                               break
+                               ;;
+                       esac
+               fi
+               ((counter++))
+       done
+
+       echo $counter
+}
+
+# Transforms a multiline list of strings into a single line string
+# with the words separated by "|".
+# This is used to prepare arguments to __runc_pos_first_nonflag().
+__runc_to_alternatives() {
+       local parts=($1)
+       local IFS='|'
+       echo "${parts[*]}"
+}
+
+# Transforms a multiline list of options into an extglob pattern
+# suitable for use in case statements.
+__runc_to_extglob() {
+       local extglob=$(__runc_to_alternatives "$1")
+       echo "@($extglob)"
+}
+
+# Subcommand processing.
+# Locates the first occurrence of any of the subcommands contained in the
+# first argument. In case of a match, calls the corresponding completion
+# function and returns 0.
+# If no match is found, 1 is returned. The calling function can then
+# continue processing its completion.
+#
+# TODO if the preceding command has options that accept arguments and an
+# argument is equal to one of the subcommands, this is falsely detected as
+# a match.
+__runc_subcommands() {
+       local subcommands="$1"
+
+       local counter=$(($command_pos + 1))
+       while [ $counter -lt $cword ]; do
+               case "${words[$counter]}" in
+               $(__runc_to_extglob "$subcommands"))
+                       subcommand_pos=$counter
+                       local subcommand=${words[$counter]}
+                       local completions_func=_runc_${command}_${subcommand}
+                       declare -F $completions_func >/dev/null && $completions_func
+                       return 0
+                       ;;
+               esac
+               ((counter++))
+       done
+       return 1
+}
+
+# List all Signals
+__runc_list_signals() {
+       COMPREPLY=($(compgen -W "$(for i in $(kill -l | xargs); do echo $i; done | grep SIG)"))
+}
+
+# suppress trailing whitespace
+__runc_nospace() {
+       # compopt is not available in ancient bash versions
+       type compopt &>/dev/null && compopt -o nospace
+}
+
+# The list of capabilities is defined in types.go, ALL was added manually.
+__runc_complete_capabilities() {
+       COMPREPLY=($(compgen -W "
+               ALL
+               AUDIT_CONTROL
+               AUDIT_WRITE
+               AUDIT_READ
+               BLOCK_SUSPEND
+               CHOWN
+               DAC_OVERRIDE
+               DAC_READ_SEARCH
+               FOWNER
+               FSETID
+               IPC_LOCK
+               IPC_OWNER
+               KILL
+               LEASE
+               LINUX_IMMUTABLE
+               MAC_ADMIN
+               MAC_OVERRIDE
+               MKNOD
+               NET_ADMIN
+               NET_BIND_SERVICE
+               NET_BROADCAST
+               NET_RAW
+               SETFCAP
+               SETGID
+               SETPCAP
+               SETUID
+               SYS_ADMIN
+               SYS_BOOT
+               SYS_CHROOT
+               SYSLOG
+               SYS_MODULE
+               SYS_NICE
+               SYS_PACCT
+               SYS_PTRACE
+               SYS_RAWIO
+               SYS_RESOURCE
+               SYS_TIME
+               SYS_TTY_CONFIG
+               WAKE_ALARM
+       " -- "$cur"))
+}
+
+_runc_exec() {
+       local boolean_options="
+          --help
+          --no-new-privs
+          --tty, -t
+          --detach, -d
+       "
+
+       local options_with_args="
+          --console-socket
+          --cwd
+          --env, -e
+          --user, -u
+          --additional-gids, -g
+          --process, -p
+          --pid-file
+          --process-label
+          --apparmor
+          --cap, -c
+          --preserve-fds
+       "
+
+       local all_options="$options_with_args $boolean_options"
+
+       case "$prev" in
+       --cap | -c)
+               __runc_complete_capabilities
+               return
+               ;;
+
+       --console-socket | --cwd | --process | --apparmor)
+               case "$cur" in
+               *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               /*)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+       --env | -e)
+               COMPREPLY=($(compgen -e -- "$cur"))
+               __runc_nospace
+               return
+               ;;
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+# global options that may appear after the runc command
+_runc_runc() {
+       local boolean_options="
+               $global_boolean_options
+               --help
+               --version -v
+               --debug
+       "
+       local options_with_args="
+               --log
+               --log-format
+               --root
+               --criu
+               --rootless
+       "
+
+       case "$prev" in
+       --log | --root | --criu)
+               case "$cur" in
+               *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               *)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       --log-format)
+               COMPREPLY=($(compgen -W 'text json' -- "$cur"))
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+               if [ $cword -eq $counter ]; then
+                       COMPREPLY=($(compgen -W "${commands[*]} help" -- "$cur"))
+               fi
+               ;;
+       esac
+}
+
+_runc_pause() {
+       local boolean_options="
+          --help
+          -h
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_ps() {
+       local boolean_options="
+          --help
+          -h
+       "
+       local options_with_args="
+          --format, -f
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_delete() { # completion for "runc delete"; its only flags are --help/-h and --force/-f
+       local boolean_options="
+          --help
+          -h
+          --force, -f
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_kill() {
+       local boolean_options="
+          --help
+          -h
+          --all
+          -a
+       "
+
+       case "$prev" in
+       "kill")
+               __runc_list_all
+               return
+               ;;
+       *)
+               __runc_list_signals
+               return
+               ;;
+       esac
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_events() {
+       local boolean_options="
+          --help
+          --stats
+       "
+
+       local options_with_args="
+          --interval
+       "
+
+       case "$prev" in
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_list() {
+       local boolean_options="
+          --help
+          --quiet 
+          -q
+       "
+
+       local options_with_args="
+          --format
+          -f
+       "
+
+       case "$prev" in
+       --format | -f)
+               COMPREPLY=($(compgen -W 'text json' -- "$cur"))
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+               ;;
+       esac
+}
+
+_runc_spec() {
+       local boolean_options="
+          --help
+          --rootless
+       "
+
+       local options_with_args="
+          --bundle
+          -b
+       "
+
+       case "$prev" in
+       --bundle | -b)
+               case "$cur" in
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               /*)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+               ;;
+       esac
+}
+
+_runc_run() { # completion for "runc run"; fixes the "--detatch" typo (the real flag is --detach, -d)
+       local boolean_options="
+          --help
+          --detach
+          -d
+          --no-subreaper
+          --no-pivot
+          --no-new-keyring
+       "
+
+       local options_with_args="
+          --bundle
+          -b
+          --console-socket
+          --pid-file
+          --preserve-fds
+       "
+
+       case "$prev" in
+       --bundle | -b | --console-socket | --pid-file)
+               case "$cur" in
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               /*)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_checkpoint() {
+       local boolean_options="
+          --help
+          -h
+          --leave-running
+          --tcp-established
+          --ext-unix-sk
+          --shell-job
+          --lazy-pages
+          --file-locks
+          --pre-dump
+          --auto-dedup
+       "
+
+       local options_with_args="
+          --image-path
+          --work-path
+          --parent-path
+          --status-fd
+          --page-server
+          --manage-cgroups-mode
+          --empty-ns
+       "
+
+       case "$prev" in
+       --page-server) ;;
+
+       --manage-cgroups-mode)
+               COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+               return
+               ;;
+
+       --image-path | --work-path | --parent-path)
+               case "$cur" in
+               *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               *)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+_runc_create() {
+       local boolean_options="
+          --help
+          --no-pivot
+          --no-new-keyring
+       "
+
+       local options_with_args="
+          --bundle
+          -b
+          --console-socket
+          --pid-file
+          --preserve-fds
+       "
+       case "$prev" in
+       --bundle | -b | --console-socket | --pid-file)
+               case "$cur" in
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               /*)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+
+}
+
+_runc_help() {
+       local counter=$(__runc_pos_first_nonflag)
+       if [ $cword -eq $counter ]; then
+               COMPREPLY=($(compgen -W "${commands[*]}" -- "$cur"))
+       fi
+}
+
+_runc_restore() {
+       local boolean_options="
+          --help
+          --tcp-established
+          --ext-unix-sk
+          --shell-job
+          --file-locks
+          --detach
+          -d
+          --no-subreaper
+          --no-pivot
+          --auto-dedup
+          --lazy-pages
+       "
+
+       local options_with_args="
+          -b
+          --bundle
+          --image-path
+          --work-path
+          --manage-cgroups-mode
+          --pid-file
+          --empty-ns
+       "
+
+       local all_options="$options_with_args $boolean_options"
+
+       case "$prev" in
+       --manage-cgroups-mode)
+               COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+               return
+               ;;
+
+       --pid-file | --image-path | --work-path | --bundle | -b)
+               case "$cur" in
+               *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+               '')
+                       COMPREPLY=($(compgen -W '/' -- "$cur"))
+                       __runc_nospace
+                       ;;
+               /*)
+                       _filedir
+                       __runc_nospace
+                       ;;
+               esac
+               return
+               ;;
+
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_resume() {
+       local boolean_options="
+          --help
+          -h
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc_state() {
+       local boolean_options="
+          --help
+          -h
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+_runc_start() {
+       local boolean_options="
+          --help
+          -h
+       "
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+_runc_update() {
+       local boolean_options="
+          --help
+       "
+
+       local options_with_args="
+          --blkio-weight
+          --cpu-period
+          --cpu-quota
+          --cpu-rt-period
+          --cpu-rt-runtime
+          --cpu-share
+          --cpuset-cpus
+          --cpuset-mems
+          --kernel-memory
+          --kernel-memory-tcp
+          --memory
+          --memory-reservation
+          --memory-swap
+          --pids-limit
+          --l3-cache-schema
+          --mem-bw-schema
+       "
+
+       case "$prev" in
+       $(__runc_to_extglob "$options_with_args"))
+               return
+               ;;
+       esac
+
+       case "$cur" in
+       -*)
+               COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+               ;;
+       *)
+               __runc_list_all
+               ;;
+       esac
+}
+
+_runc() {
+       local previous_extglob_setting=$(shopt -p extglob)
+       shopt -s extglob
+
+       local commands=(
+               checkpoint
+               create
+               delete
+               events
+               exec
+               init
+               kill
+               list
+               pause
+               ps
+               restore
+               resume
+               run
+               spec
+               start
+               state
+               update
+               help
+               h
+       )
+
+       # These options are valid as global options for all client commands
+       # and valid as command options for `runc daemon`
+       local global_boolean_options="
+               --help -h
+               --version -v
+       "
+
+       COMPREPLY=()
+       local cur prev words cword
+       _get_comp_words_by_ref -n : cur prev words cword
+
+       local command='runc' command_pos=0 subcommand_pos
+       local counter=1
+       while [ $counter -lt $cword ]; do
+               case "${words[$counter]}" in
+               -*) ;;
+               =)
+                       ((counter++))
+                       ;;
+               *)
+                       command="${words[$counter]}"
+                       command_pos=$counter
+                       break
+                       ;;
+               esac
+               ((counter++))
+       done
+
+       local completions_func=_runc_${command}
+       declare -F $completions_func >/dev/null && $completions_func
+
+       eval "$previous_extglob_setting"
+       return 0
+}
+
+eval "$__runc_previous_extglob_setting"
+unset __runc_previous_extglob_setting
+
+complete -F _runc runc
diff --git a/create.go b/create.go
new file mode 100644 (file)
index 0000000..5f3ac60
--- /dev/null
+++ b/create.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+       "os"
+
+       "github.com/urfave/cli"
+)
+
+var createCommand = cli.Command{
+       Name:  "create",
+       Usage: "create a container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+       Description: `The create command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "` + specConfig + `" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "bundle, b",
+                       Value: "",
+                       Usage: `path to the root of the bundle directory, defaults to the current directory`,
+               },
+               cli.StringFlag{
+                       Name:  "console-socket",
+                       Value: "",
+                       Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+               },
+               cli.StringFlag{
+                       Name:  "pid-file",
+                       Value: "",
+                       Usage: "specify the file to write the process id to",
+               },
+               cli.BoolFlag{
+                       Name:  "no-pivot",
+                       Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+               },
+               cli.BoolFlag{
+                       Name:  "no-new-keyring",
+                       Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
+               },
+               cli.IntFlag{
+                       Name:  "preserve-fds",
+                       Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               if err := revisePidFile(context); err != nil {
+                       return err
+               }
+               spec, err := setupSpec(context)
+               if err != nil {
+                       return err
+               }
+               status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
+               if err != nil {
+                       return err
+               }
+               // exit with the container's exit status so any external supervisor is
+               // notified of the exit with the correct exit status.
+               os.Exit(status)
+               return nil
+       },
+}
diff --git a/delete.go b/delete.go
new file mode 100644 (file)
index 0000000..fb6f38e
--- /dev/null
+++ b/delete.go
@@ -0,0 +1,89 @@
+// +build !solaris
+
+package main
+
+import (
+       "fmt"
+       "os"
+       "path/filepath"
+       "syscall"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/urfave/cli"
+
+       "golang.org/x/sys/unix"
+)
+
+func killContainer(container libcontainer.Container) error {
+       _ = container.Signal(unix.SIGKILL, false)
+       for i := 0; i < 100; i++ {
+               time.Sleep(100 * time.Millisecond)
+               if err := container.Signal(syscall.Signal(0), false); err != nil {
+                       destroy(container)
+                       return nil
+               }
+       }
+       return fmt.Errorf("container init still running")
+}
+
+var deleteCommand = cli.Command{
+       Name:  "delete",
+       Usage: "delete any resources held by the container often used with detached container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" and runc list currently shows the
+status of "ubuntu01" as "stopped" the following will delete resources held for
+"ubuntu01" removing "ubuntu01" from the runc list of containers:
+
+       # runc delete ubuntu01`,
+       Flags: []cli.Flag{
+               cli.BoolFlag{
+                       Name:  "force, f",
+                       Usage: "Forcibly deletes the container if it is still running (uses SIGKILL)",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+
+               id := context.Args().First()
+               force := context.Bool("force")
+               container, err := getContainer(context)
+               if err != nil {
+                       if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists {
+                               // if there was an aborted start or something of the sort then the container's directory could exist but
+                               // libcontainer does not see it because the state.json file inside that directory was never created.
+                               path := filepath.Join(context.GlobalString("root"), id)
+                               if e := os.RemoveAll(path); e != nil {
+                                       fmt.Fprintf(os.Stderr, "remove %s: %v\n", path, e)
+                               }
+                               if force {
+                                       return nil
+                               }
+                       }
+                       return err
+               }
+               s, err := container.Status()
+               if err != nil {
+                       return err
+               }
+               switch s {
+               case libcontainer.Stopped:
+                       destroy(container)
+               case libcontainer.Created:
+                       return killContainer(container)
+               default:
+                       if force {
+                               return killContainer(container)
+                       }
+                       return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s)
+               }
+
+               return nil
+       },
+}
diff --git a/docs/checkpoint-restore.md b/docs/checkpoint-restore.md
new file mode 100644 (file)
index 0000000..80ec46b
--- /dev/null
@@ -0,0 +1,50 @@
+# Checkpoint and Restore #
+
+For a basic description about checkpointing and restoring containers with
+`runc` please see [runc-checkpoint(8)](../man/runc-checkpoint.8.md) and
+[runc-restore(8)](../man/runc-restore.8.md).
+
+## Checkpoint/Restore Annotations ##
+
+In addition to specifying options on the command-line like it is described
+in the man-pages (see above), it is also possible to influence CRIU's
+behaviour using CRIU configuration files. For details about CRIU's
+configuration file support please see [CRIU's wiki](https://criu.org/Configuration_files).
+
+In addition to CRIU's default configuration files `runc` tells CRIU to
+also evaluate the file `/etc/criu/runc.conf`. Using the annotation
+`org.criu.config` it is, however, possible to change this additional
+CRIU configuration file.
+
+If the annotation `org.criu.config` is set to an empty string `runc`
+will not pass any additional configuration file to CRIU. With an empty
+string it is therefore possible to disable the additional CRIU configuration
+file. This can be used to make sure that no additional configuration file
+changes CRIU's behaviour accidentally.
+
+If the annotation `org.criu.config` is set to a non-empty string `runc` will
+pass that string to CRIU to be evaluated as an additional configuration file.
+If CRIU cannot open this additional configuration file, it will ignore this
+file and continue.
+
+### Annotation Example to disable additional CRIU configuration file ###
+
+```
+{
+       "ociVersion": "1.0.0",
+       "annotations": {
+               "org.criu.config": ""
+       },
+       "process": {
+```
+
+### Annotation Example to set a specific CRIU configuration file ###
+
+```
+{
+       "ociVersion": "1.0.0",
+       "annotations": {
+               "org.criu.config": "/etc/special-runc-criu-options"
+       },
+       "process": {
+```
diff --git a/docs/terminals.md b/docs/terminals.md
new file mode 100644 (file)
index 0000000..fc000e1
--- /dev/null
@@ -0,0 +1,314 @@
+# Terminals and Standard IO #
+
+*Note that the default configuration of `runc` (foreground, new terminal) is
+generally the best option for most users. This document exists to help explain
+what the purpose of the different modes is, and to try to steer users away from
+common mistakes and misunderstandings.*
+
+In general, most processes on Unix (and Unix-like) operating systems have 3
+standard file descriptors provided at the start, collectively referred to as
+"standard IO" (`stdio`):
+
+* `0`: standard-in (`stdin`), the input stream into the process
+* `1`: standard-out (`stdout`), the output stream from the process
+* `2`: standard-error (`stderr`), the error stream from the process
+
+When creating and running a container via `runc`, it is important to take care
+to structure the `stdio` the new container's process receives. In some ways
+containers are just regular processes, while in other ways they're an isolated
+sub-partition of your machine (in a similar sense to a VM). This means that the
+structure of IO is not as simple as with ordinary programs (which generally
+just use the file descriptors you give them).
+
+## Other File Descriptors ##
+
+Before we continue, it is important to note that processes can have more file
+descriptors than just `stdio`. By default in `runc` no other file descriptors
+will be passed to the spawned container process. If you wish to explicitly pass
+file descriptors to the container you have to use the `--preserve-fds` option.
+These ancillary file descriptors don't have any of the strange semantics
+discussed further in this document (those only apply to `stdio`) -- they are
+passed untouched by `runc`.
+
+It should be noted that `--preserve-fds` does not take individual file
+descriptors to preserve. Instead, it takes how many file descriptors (not
+including `stdio` or `LISTEN_FDS`) should be passed to the container. In the
+following example:
+
+```
+% runc run --preserve-fds 5 <container>
+```
+
+`runc` will pass the first `5` file descriptors (`3`, `4`, `5`, `6`, and `7` --
+assuming that `LISTEN_FDS` has not been configured) to the container.
+
+In addition to `--preserve-fds`, `LISTEN_FDS` file descriptors are passed
+automatically to allow for `systemd`-style socket activation. To extend the
+above example:
+
+```
+% LISTEN_PID=$pid_of_runc LISTEN_FDS=3 runc run --preserve-fds 5 <container>
+```
+
+`runc` will now pass the first `8` file descriptors (and it will also pass
+`LISTEN_FDS=3` and `LISTEN_PID=1` to the container). The first `3` (`3`, `4`,
+and `5`) were passed due to `LISTEN_FDS` and the other `5` (`6`, `7`, `8`, `9`,
+and `10`) were passed due to `--preserve-fds`. You should keep this in mind if
+you use `runc` directly in something like a `systemd` unit file. To disable
+this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`.
+
+**Be very careful when passing file descriptors to a container process.** Due
+to some Linux kernel (mis)features, a container with access to certain types of
+file descriptors (such as `O_PATH` descriptors) outside of the container's root
+file system can use these to break out of the container's pivoted mount
+namespace. [This has resulted in CVEs in the past.][CVE-2016-9962]
+
+[CVE-2016-9962]: https://nvd.nist.gov/vuln/detail/CVE-2016-9962
+
+## <a name="terminal-modes" /> Terminal Modes ##
+
+`runc` supports two distinct methods for passing `stdio` to the container's
+primary process:
+
+* [new terminal](#new-terminal) (`terminal: true`)
+* [pass-through](#pass-through) (`terminal: false`)
+
+When first using `runc` these two modes will look incredibly similar, but this
+can be quite deceptive as these different modes have quite different
+characteristics.
+
+By default, `runc spec` will create a configuration that will create a new
+terminal (`terminal: true`). However, if the `terminal: ...` line is not
+present in `config.json` then pass-through is the default.
+
+*In general we recommend using new terminal, because it means that tools like
+`sudo` will work inside your container. But pass-through can be useful if you
+know what you're doing, or if you're using `runc` as part of a non-interactive
+pipeline.*
+
+### <a name="new-terminal"> New Terminal ###
+
+In new terminal mode, `runc` will create a brand-new "console" (or more
+precisely, a new pseudo-terminal using the container's namespaced
+`/dev/pts/ptmx`) for your contained process to use as its `stdio`.
+
+When you start a process in new terminal mode, `runc` will do the following:
+
+1. Create a new pseudo-terminal.
+2. Pass the slave end to the container's primary process as its `stdio`.
+3. Send the master end to a process to interact with the `stdio` for the
+   container's primary process ([details below](#runc-modes)).
+
+It should be noted that since a new pseudo-terminal is being used for
+communication with the container, some strange properties of pseudo-terminals
+might surprise you. For instance, by default, all new pseudo-terminals
+translate the byte `'\n'` to the sequence `'\r\n'` on both `stdout` and
+`stderr`. In addition there are [a whole range of `ioctls(2)` that can only
+interact with pseudo-terminal `stdio`][tty_ioctl(4)].
+
+> **NOTE**: In new terminal mode, all three `stdio` file descriptors are the
+> same underlying file. The reason for this is to match how a shell's `stdio`
+> looks to a process (as well as remove race condition issues with having to
+> deal with multiple master pseudo-terminal file descriptors). However this
+> means that it is not really possible to uniquely distinguish between `stdout`
+> and `stderr` from the caller's perspective.
+
+[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl
+
+### <a name="pass-through"> Pass-Through ###
+
+If you have already set up some file handles that you wish your contained
+process to use as its `stdio`, then you can ask `runc` to pass them through to
+the contained process (this is not necessarily the same as `--preserve-fds`'s
+passing of file descriptors -- [details below](#runc-modes)). As an example
+(assuming that `terminal: false` is set in `config.json`):
+
+```
+% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err
+```
+
+Here the container's various `stdio` file descriptors will be substituted with
+the following:
+
+* `stdin` will be sourced from the `echo input` pipeline.
+* `stdout` will be output into `/tmp/log.out` on the host.
+* `stderr` will be output into `/tmp/log.err` on the host.
+
+It should be noted that the actual file handles seen inside the container may
+be different [based on the mode `runc` is being used in](#runc-modes) (for
+instance, the file referenced by `1` could be `/tmp/log.out` directly or a pipe
+which `runc` is using to buffer output, based on the mode). However the net
+result will be the same in either case. In principle you could use the [new
+terminal mode](#new-terminal) in a pipeline, but the difference will become
+more clear when you are introduced to [`runc`'s detached mode](#runc-modes).
+
+## <a name="runc-modes" /> `runc` Modes ##
+
+`runc` itself runs in two modes:
+
+* [foreground](#foreground)
+* [detached](#detached)
+
+You can use either [terminal mode](#terminal-modes) with either `runc` mode.
+However, there are considerations that may indicate preference for one mode
+over another. It should be noted that while two types of modes (terminal and
+`runc`) are conceptually independent from each other, you should be aware of
+the intricacies of which combination you are using.
+
+*In general we recommend using foreground because it's the most
+straight-forward to use, with the only downside being that you will have a
+long-running `runc` process. Detached mode is difficult to get right and
+generally requires having your own `stdio` management.*
+
+### Foreground ###
+
+The default (and most straight-forward) mode of `runc`. In this mode, your
+`runc` command remains in the foreground with the container process as a child.
+All `stdio` is buffered through the foreground `runc` process (irrespective of
+which terminal mode you are using). This is conceptually quite similar to
+running a normal process interactively in a shell (and if you are using `runc`
+in a shell interactively, this is what you should use).
+
+Because the `stdio` will be buffered in this mode, some very important
+peculiarities of this mode should be kept in mind:
+
+* With [new terminal mode](#new-terminal), the container will see a
+  pseudo-terminal as its `stdio` (as you might expect). However, the `stdio` of
+  the foreground `runc` process will remain the `stdio` that the process was
+  started with -- and `runc` will copy all `stdio` between its `stdio` and the
+  container's `stdio`. This means that while a new pseudo-terminal has been
+  created, the foreground `runc` process manages it over the lifetime of the
+  container.
+
+* With [pass-through mode](#pass-through), the foreground `runc`'s `stdio` is
+  **not** passed to the container. Instead, the container's `stdio` is a set of
+  pipes which are used to copy data between `runc`'s `stdio` and the
+  container's `stdio`. This means that the container never has direct access to
+  host file descriptors (aside from the pipes created by the container runtime,
+  but that shouldn't be an issue).
+
+The main drawback of the foreground mode of operation is that it requires a
+long-running foreground `runc` process. If you kill the foreground `runc`
+process then you will no longer have access to the `stdio` of the container
+(and in most cases this will result in the container dying abnormally due to
+`SIGPIPE` or some other error). By extension this means that any bug in the
+long-running foreground `runc` process (such as a memory leak) or a stray
+OOM-kill sweep could result in your container being killed **through no fault
+of the user**. In addition, there is no way in foreground mode of passing a
+file descriptor directly to the container process as its `stdio` (like
+`--preserve-fds` does).
+
+These shortcomings are obviously sub-optimal and are the reason that `runc` has
+an additional mode called "detached mode".
+
+### Detached ###
+
+In contrast to foreground mode, in detached mode there is no long-running
+foreground `runc` process once the container has started. In fact, there is no
+long-running `runc` process at all. However, this means that it is up to the
+caller to handle the `stdio` after `runc` has set it up for you. In a shell
+this means that the `runc` command will exit and control will return to the
+shell, after the container has been set up.
+
+You can run `runc` in detached mode in one of the following ways:
+
+* `runc run -d ...` which operates similar to `runc run` but is detached.
+* `runc create` followed by `runc start` which is the standard container
+  lifecycle defined by the OCI runtime specification (`runc create` sets up the
+  container completely, waiting for `runc start` to begin execution of user
+  code).
+
+The main use-case of detached mode is for higher-level tools that want to be
+wrappers around `runc`. By running `runc` in detached mode, those tools have
+far more control over the container's `stdio` without `runc` getting in the
+way (most wrappers around `runc` like `cri-o` or `containerd` use detached mode
+for this reason).
+
+Unfortunately using detached mode is a bit more complicated and requires more
+care than the foreground mode -- mainly because it is now up to the caller to
+handle the `stdio` of the container.
+
+#### Detached Pass-Through ####
+
+In detached mode, pass-through actually does what it says on the tin -- the
+`stdio` file descriptors of the `runc` process are passed through (untouched)
+to the container's `stdio`. The purpose of this option is to allow a user to
+set up `stdio` for a container themselves and then force `runc` to just use
+their pre-prepared `stdio` (without any pseudo-terminal funny business). *If
+you don't see why this would be useful, don't use this option.*
+
+**You must be incredibly careful when using detached pass-through (especially
+in a shell).** The reason for this is that by using detached pass-through you
+are passing host file descriptors to the container. In the case of a shell,
+usually your `stdio` is going to be a pseudo-terminal (on your host). A
+malicious container could take advantage of TTY-specific `ioctls` like
+`TIOCSTI` to fake input into the **host** shell (remember that in detached
+mode, control is returned to your shell and so the terminal you've given the
+container is being read by a shell prompt).
+
+There are also several other issues with running non-malicious containers in a
+shell with detached pass-through (where you pass your shell's `stdio` to the
+container):
+
+* Output from the container will be interleaved with output from your shell (in
+  a non-deterministic way), without any real way of distinguishing from where a
+  particular piece of output came from.
+
+* Any input to `stdin` will be non-deterministically split and given to either
+  the container or the shell (because both are blocked on a `read(2)` of the
+  same FIFO-style file descriptor).
+
+They are all related to the fact that there is going to be a race when either
+your host or the container tries to read from (or write to) `stdio`. This
+problem is especially obvious when in a shell, where usually the terminal has
+been put into raw mode (where each individual key-press should cause `read(2)`
+to return).
+
+> **NOTE**: There is also currently a [known problem][issue-1721] where using
+> detached pass-through will result in the container hanging if the `stdout` or
+> `stderr` is a pipe (though this should be a temporary issue).
+
+[issue-1721]: https://github.com/opencontainers/runc/issues/1721
+
+#### Detached New Terminal ####
+
+When creating a new pseudo-terminal in detached mode, a fairly obvious
+problem appears -- how do we use the new terminal that `runc` created? Unlike
+in pass-through, `runc` has created a new set of file descriptors that need to
+be used by *something* in order for container communication to work.
+
+The way this problem is resolved is through the use of Unix domain sockets.
+There is a feature of Unix sockets called `SCM_RIGHTS` which allows a file
+descriptor to be sent through a Unix socket to a completely separate process
+(which can then use that file descriptor as though they opened it). When using
+`runc` in detached new terminal mode, this is how a user gets access to the
+pseudo-terminal's master file descriptor.
+
+To this end, there is a new option (which is required if you want to use `runc`
+in detached new terminal mode): `--console-socket`. This option takes the path
+to a Unix domain socket which `runc` will connect to and send the
+pseudo-terminal master file descriptor down. The general process for getting
+the pseudo-terminal master is as follows:
+
+1. Create a Unix domain socket at some path, `$socket_path`.
+2. Call `runc run` or `runc create` with the argument `--console-socket
+   $socket_path`.
+3. Using `recvmsg(2)` retrieve the file descriptor sent using `SCM_RIGHTS` by
+   `runc`.
+4. Now the manager can interact with the `stdio` of the container, using the
+   retrieved pseudo-terminal master.
+
+After `runc` exits, the only process with a copy of the pseudo-terminal master
+file descriptor is whoever read the file descriptor from the socket.
+
+> **NOTE**: Currently `runc` doesn't support abstract socket addresses (due to
+> it not being possible to pass an `argv` with a null-byte as the first
+> character). In the future this may change, but currently you must use a valid
+> path name.
+
+In order to help users make use of detached new terminal mode, we have provided
+a [Go implementation in the `go-runc` bindings][containerd/go-runc.Socket], as
+well as [a simple client][recvtty].
+
+[containerd/go-runc.Socket]: https://godoc.org/github.com/containerd/go-runc#Socket
+[recvtty]: /contrib/cmd/recvtty
diff --git a/events.go b/events.go
new file mode 100644 (file)
index 0000000..fb3f630
--- /dev/null
+++ b/events.go
@@ -0,0 +1,215 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "sync"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/types"
+
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+var eventsCommand = cli.Command{
+       Name:  "events",
+       Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.`,
+       Description: `The events command displays information about the container. By default the
+information is displayed once every 5 seconds.`,
+       Flags: []cli.Flag{
+               cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"},
+               cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"},
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               duration := context.Duration("interval")
+               if duration <= 0 {
+                       return fmt.Errorf("duration interval must be greater than 0")
+               }
+               status, err := container.Status()
+               if err != nil {
+                       return err
+               }
+               if status == libcontainer.Stopped {
+                       return fmt.Errorf("container with id %s is not running", container.ID())
+               }
+               var (
+                       stats  = make(chan *libcontainer.Stats, 1)
+                       events = make(chan *types.Event, 1024)
+                       group  = &sync.WaitGroup{}
+               )
+               group.Add(1)
+               go func() {
+                       defer group.Done()
+                       enc := json.NewEncoder(os.Stdout)
+                       for e := range events {
+                               if err := enc.Encode(e); err != nil {
+                                       logrus.Error(err)
+                               }
+                       }
+               }()
+               if context.Bool("stats") {
+                       s, err := container.Stats()
+                       if err != nil {
+                               return err
+                       }
+                       events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+                       close(events)
+                       group.Wait()
+                       return nil
+               }
+               go func() {
+                       for range time.Tick(context.Duration("interval")) {
+                               s, err := container.Stats()
+                               if err != nil {
+                                       logrus.Error(err)
+                                       continue
+                               }
+                               stats <- s
+                       }
+               }()
+               n, err := container.NotifyOOM()
+               if err != nil {
+                       return err
+               }
+               for {
+                       select {
+                       case _, ok := <-n:
+                               if ok {
+                                       // this means an oom event was received, if it is !ok then
+                                       // the channel was closed because the container stopped and
+                                       // the cgroups no longer exist.
+                                       events <- &types.Event{Type: "oom", ID: container.ID()}
+                               } else {
+                                       n = nil
+                               }
+                       case s := <-stats:
+                               events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+                       }
+                       if n == nil {
+                               close(events)
+                               break
+                       }
+               }
+               group.Wait()
+               return nil
+       },
+}
+
+func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
+       cg := ls.CgroupStats
+       if cg == nil {
+               return nil
+       }
+       var s types.Stats
+       s.Pids.Current = cg.PidsStats.Current
+       s.Pids.Limit = cg.PidsStats.Limit
+
+       s.CPU.Usage.Kernel = cg.CpuStats.CpuUsage.UsageInKernelmode
+       s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode
+       s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage
+       s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage
+       s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods
+       s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods
+       s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime
+
+       s.Memory.Cache = cg.MemoryStats.Cache
+       s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage)
+       s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage)
+       s.Memory.Swap = convertMemoryEntry(cg.MemoryStats.SwapUsage)
+       s.Memory.Usage = convertMemoryEntry(cg.MemoryStats.Usage)
+       s.Memory.Raw = cg.MemoryStats.Stats
+
+       s.Blkio.IoServiceBytesRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceBytesRecursive)
+       s.Blkio.IoServicedRecursive = convertBlkioEntry(cg.BlkioStats.IoServicedRecursive)
+       s.Blkio.IoQueuedRecursive = convertBlkioEntry(cg.BlkioStats.IoQueuedRecursive)
+       s.Blkio.IoServiceTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceTimeRecursive)
+       s.Blkio.IoWaitTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoWaitTimeRecursive)
+       s.Blkio.IoMergedRecursive = convertBlkioEntry(cg.BlkioStats.IoMergedRecursive)
+       s.Blkio.IoTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoTimeRecursive)
+       s.Blkio.SectorsRecursive = convertBlkioEntry(cg.BlkioStats.SectorsRecursive)
+
+       s.Hugetlb = make(map[string]types.Hugetlb)
+       for k, v := range cg.HugetlbStats {
+               s.Hugetlb[k] = convertHugtlb(v)
+       }
+
+       if is := ls.IntelRdtStats; is != nil {
+               if intelrdt.IsCatEnabled() {
+                       s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo)
+                       s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot
+                       s.IntelRdt.L3CacheSchema = is.L3CacheSchema
+               }
+               if intelrdt.IsMbaEnabled() {
+                       s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo)
+                       s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot
+                       s.IntelRdt.MemBwSchema = is.MemBwSchema
+               }
+       }
+
+       s.NetworkInterfaces = ls.Interfaces
+       return &s
+}
+
+func convertHugtlb(c cgroups.HugetlbStats) types.Hugetlb {
+       return types.Hugetlb{
+               Usage:   c.Usage,
+               Max:     c.MaxUsage,
+               Failcnt: c.Failcnt,
+       }
+}
+
+func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry {
+       return types.MemoryEntry{
+               Limit:   c.Limit,
+               Usage:   c.Usage,
+               Max:     c.MaxUsage,
+               Failcnt: c.Failcnt,
+       }
+}
+
+func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry {
+       var out []types.BlkioEntry
+       for _, e := range c {
+               out = append(out, types.BlkioEntry{
+                       Major: e.Major,
+                       Minor: e.Minor,
+                       Op:    e.Op,
+                       Value: e.Value,
+               })
+       }
+       return out
+}
+
+func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo {
+       return &types.L3CacheInfo{
+               CbmMask:    i.CbmMask,
+               MinCbmBits: i.MinCbmBits,
+               NumClosids: i.NumClosids,
+       }
+}
+
+func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo {
+       return &types.MemBwInfo{
+               BandwidthGran: i.BandwidthGran,
+               DelayLinear:   i.DelayLinear,
+               MinBandwidth:  i.MinBandwidth,
+               NumClosids:    i.NumClosids,
+       }
+}
diff --git a/exec.go b/exec.go
new file mode 100644 (file)
index 0000000..b963d68
--- /dev/null
+++ b/exec.go
@@ -0,0 +1,235 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/runtime-spec/specs-go"
+       "github.com/urfave/cli"
+)
+
+// execCommand implements "runc exec": start an additional process inside an
+// already-running container. The process is described either by CLI flags
+// (command, cwd, env, user, caps, ...) or by a process.json file via -p.
+var execCommand = cli.Command{
+       Name:  "exec",
+       Usage: "execute new process inside the container",
+       ArgsUsage: `<container-id> <command> [command options]  || -p process.json <container-id>
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless a "-p" flag provided.
+
+EXAMPLE:
+For example, if the container is configured to run the linux ps command the
+following will output a list of processes running in the container:
+
+       # runc exec <container-id> ps`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "console-socket",
+                       Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+               },
+               cli.StringFlag{
+                       Name:  "cwd",
+                       Usage: "current working directory in the container",
+               },
+               cli.StringSliceFlag{
+                       Name:  "env, e",
+                       Usage: "set environment variables",
+               },
+               cli.BoolFlag{
+                       Name:  "tty, t",
+                       Usage: "allocate a pseudo-TTY",
+               },
+               cli.StringFlag{
+                       Name:  "user, u",
+                       Usage: "UID (format: <uid>[:<gid>])",
+               },
+               cli.Int64SliceFlag{
+                       Name:  "additional-gids, g",
+                       Usage: "additional gids",
+               },
+               cli.StringFlag{
+                       Name:  "process, p",
+                       Usage: "path to the process.json",
+               },
+               cli.BoolFlag{
+                       Name:  "detach,d",
+                       Usage: "detach from the container's process",
+               },
+               cli.StringFlag{
+                       Name:  "pid-file",
+                       Value: "",
+                       Usage: "specify the file to write the process id to",
+               },
+               // NOTE(review): "asm" in the usage string below reads like a
+               // typo (SELinux label wording) — confirm upstream before
+               // changing this user-visible text.
+               cli.StringFlag{
+                       Name:  "process-label",
+                       Usage: "set the asm process label for the process commonly used with selinux",
+               },
+               cli.StringFlag{
+                       Name:  "apparmor",
+                       Usage: "set the apparmor profile for the process",
+               },
+               cli.BoolFlag{
+                       Name:  "no-new-privs",
+                       Usage: "set the no new privileges value for the process",
+               },
+               cli.StringSliceFlag{
+                       Name:  "cap, c",
+                       Value: &cli.StringSlice{},
+                       Usage: "add a capability to the bounding set for the process",
+               },
+               cli.BoolFlag{
+                       Name:   "no-subreaper",
+                       Usage:  "disable the use of the subreaper used to reap reparented processes",
+                       Hidden: true,
+               },
+               cli.IntFlag{
+                       Name:  "preserve-fds",
+                       Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               // At least one argument (the container id) is required.
+               if err := checkArgs(context, 1, minArgs); err != nil {
+                       return err
+               }
+               if err := revisePidFile(context); err != nil {
+                       return err
+               }
+               status, err := execProcess(context)
+               if err == nil {
+                       // Success: exit runc with the exec'd process's status.
+                       os.Exit(status)
+               }
+               return fmt.Errorf("exec failed: %v", err)
+       },
+       // Do not let the CLI library reorder flags/args; everything after the
+       // container id is treated as the command and its options.
+       SkipArgReorder: true,
+}
+
+// execProcess resolves the container named on the command line, refuses to
+// exec into a stopped container, builds the process spec via getProcess, and
+// runs it through the shared runner. The returned int is the exit status of
+// the exec'd process and is only meaningful when err is nil.
+func execProcess(context *cli.Context) (int, error) {
+       container, err := getContainer(context)
+       if err != nil {
+               return -1, err
+       }
+       status, err := container.Status()
+       if err != nil {
+               return -1, err
+       }
+       if status == libcontainer.Stopped {
+               return -1, fmt.Errorf("cannot exec a container that has stopped")
+       }
+       path := context.String("process")
+       if path == "" && len(context.Args()) == 1 {
+               // Neither a -p process.json nor a <command> after the id.
+               return -1, fmt.Errorf("process args cannot be empty")
+       }
+       detach := context.Bool("detach")
+       state, err := container.State()
+       if err != nil {
+               return -1, err
+       }
+       // The bundle directory is stored as a label in the container state;
+       // getProcess chdirs there when it needs to load config.json.
+       bundle := utils.SearchLabels(state.Config.Labels, "bundle")
+       p, err := getProcess(context, bundle)
+       if err != nil {
+               return -1, err
+       }
+
+       logLevel := "info"
+       if context.GlobalBool("debug") {
+               logLevel = "debug"
+       }
+
+       r := &runner{
+               enableSubreaper: false,
+               shouldDestroy:   false,
+               container:       container,
+               consoleSocket:   context.String("console-socket"),
+               detach:          detach,
+               pidFile:         context.String("pid-file"),
+               action:          CT_ACT_RUN,
+               init:            false, // exec'd process, not the container's init
+               preserveFDs:     context.Int("preserve-fds"),
+               logLevel:        logLevel,
+       }
+       return r.run(p)
+}
+
+// getProcess builds the specs.Process to execute, in one of two modes:
+//  1. -p/--process given: decode that JSON file directly and validate it.
+//  2. otherwise: chdir into the bundle, load config.json, and overlay the
+//     CLI flags (args, cwd, apparmor, selinux label, caps, env, tty,
+//     no-new-privs, user, additional gids) onto spec.Process.
+// The result is validated with validateProcessSpec in both cases.
+func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
+       if path := context.String("process"); path != "" {
+               f, err := os.Open(path)
+               if err != nil {
+                       return nil, err
+               }
+               defer f.Close()
+               var p specs.Process
+               if err := json.NewDecoder(f).Decode(&p); err != nil {
+                       return nil, err
+               }
+               return &p, validateProcessSpec(&p)
+       }
+       // process via cli flags
+       if err := os.Chdir(bundle); err != nil {
+               return nil, err
+       }
+       spec, err := loadSpec(specConfig)
+       if err != nil {
+               return nil, err
+       }
+       p := spec.Process
+       // Everything after the container id becomes the process argv.
+       p.Args = context.Args()[1:]
+       // override the cwd, if passed
+       if context.String("cwd") != "" {
+               p.Cwd = context.String("cwd")
+       }
+       if ap := context.String("apparmor"); ap != "" {
+               p.ApparmorProfile = ap
+       }
+       if l := context.String("process-label"); l != "" {
+               p.SelinuxLabel = l
+       }
+       // Add each requested capability to every capability set.
+       // NOTE(review): p.Capabilities may be nil if the spec omits it; the
+       // appends below would then panic — confirm whether a nil check or a
+       // default &specs.LinuxCapabilities{} is needed here.
+       if caps := context.StringSlice("cap"); len(caps) > 0 {
+               for _, c := range caps {
+                       p.Capabilities.Bounding = append(p.Capabilities.Bounding, c)
+                       p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, c)
+                       p.Capabilities.Effective = append(p.Capabilities.Effective, c)
+                       p.Capabilities.Permitted = append(p.Capabilities.Permitted, c)
+                       p.Capabilities.Ambient = append(p.Capabilities.Ambient, c)
+               }
+       }
+       // append the passed env variables
+       p.Env = append(p.Env, context.StringSlice("env")...)
+
+       // set the tty
+       if context.IsSet("tty") {
+               p.Terminal = context.Bool("tty")
+       }
+       if context.IsSet("no-new-privs") {
+               p.NoNewPrivileges = context.Bool("no-new-privs")
+       }
+       // override the user, if passed
+       if context.String("user") != "" {
+               // Format is <uid>[:<gid>]; the gid part is optional.
+               u := strings.SplitN(context.String("user"), ":", 2)
+               if len(u) > 1 {
+                       gid, err := strconv.Atoi(u[1])
+                       if err != nil {
+                               return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
+                       }
+                       p.User.GID = uint32(gid)
+               }
+               uid, err := strconv.Atoi(u[0])
+               if err != nil {
+                       return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
+               }
+               p.User.UID = uint32(uid)
+       }
+       for _, gid := range context.Int64Slice("additional-gids") {
+               if gid < 0 {
+                       return nil, fmt.Errorf("additional-gids must be a positive number %d", gid)
+               }
+               p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid))
+       }
+       return p, validateProcessSpec(p)
+}
diff --git a/init.go b/init.go
new file mode 100644 (file)
index 0000000..08351fd
--- /dev/null
+++ b/init.go
@@ -0,0 +1,50 @@
+package main
+
+import (
+       "fmt"
+       "os"
+       "runtime"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/logs"
+       _ "github.com/opencontainers/runc/libcontainer/nsenter"
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+// init runs before main. When this binary was re-executed as "runc init"
+// (the in-container bootstrap step), it pins the runtime to a single locked
+// OS thread and configures logging from the _LIBCONTAINER_* environment
+// variables handed down by the parent runc process. Any failure here panics:
+// there is no usable logging or parent channel yet.
+func init() {
+       if len(os.Args) > 1 && os.Args[1] == "init" {
+               // Single P and a locked thread — presumably required so the
+               // later namespace setup stays on this OS thread; confirm
+               // against libcontainer's nsenter/StartInitialization docs.
+               runtime.GOMAXPROCS(1)
+               runtime.LockOSThread()
+
+               level := os.Getenv("_LIBCONTAINER_LOGLEVEL")
+               logLevel, err := logrus.ParseLevel(level)
+               if err != nil {
+                       panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err))
+               }
+
+               // Log JSON back over the pipe fd supplied by the parent.
+               err = logs.ConfigureLogging(logs.Config{
+                       LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"),
+                       LogFormat: "json",
+                       LogLevel:  logLevel,
+               })
+               if err != nil {
+                       panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err))
+               }
+               logrus.Debug("child process in init()")
+       }
+}
+
+// initCommand is the internal "runc init" entry point. It is exec'd by runc
+// itself during container creation and must not be invoked by users.
+var initCommand = cli.Command{
+       Name:  "init",
+       Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
+       Action: func(context *cli.Context) error {
+               factory, _ := libcontainer.New("")
+               if err := factory.StartInitialization(); err != nil {
+                       // as the error is sent back to the parent there is no need to log
+                       // or write it to stderr because the parent process will handle this
+                       os.Exit(1)
+               }
+               // On success StartInitialization execs the user process and
+               // never returns; reaching this line means the exec failed.
+               panic("libcontainer: container init failed to exec")
+       },
+}
diff --git a/kill.go b/kill.go
new file mode 100644 (file)
index 0000000..c2d7929
--- /dev/null
+++ b/kill.go
@@ -0,0 +1,68 @@
+// +build linux
+
+package main
+
+import (
+       "fmt"
+       "strconv"
+       "strings"
+       "syscall"
+
+       "github.com/urfave/cli"
+)
+
+// killCommand implements "runc kill": send a signal (default SIGTERM) to the
+// container's init process, or to all processes in the container with --all.
+var killCommand = cli.Command{
+       Name:  "kill",
+       Usage: "kill sends the specified signal (default: SIGTERM) to the container's init process",
+       ArgsUsage: `<container-id> [signal]
+
+Where "<container-id>" is the name for the instance of the container and
+"[signal]" is the signal to be sent to the init process.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" the following will send a "KILL"
+signal to the init process of the "ubuntu01" container:
+        
+       # runc kill ubuntu01 KILL`,
+       Flags: []cli.Flag{
+               cli.BoolFlag{
+                       Name:  "all, a",
+                       Usage: "send the specified signal to all processes inside the container",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               // Accept exactly one or two args: <container-id> [signal].
+               if err := checkArgs(context, 1, minArgs); err != nil {
+                       return err
+               }
+               if err := checkArgs(context, 2, maxArgs); err != nil {
+                       return err
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+
+               // Default to SIGTERM when no signal argument is given.
+               sigstr := context.Args().Get(1)
+               if sigstr == "" {
+                       sigstr = "SIGTERM"
+               }
+
+               signal, err := parseSignal(sigstr)
+               if err != nil {
+                       return err
+               }
+               return container.Signal(signal, context.Bool("all"))
+       },
+}
+
+// parseSignal converts rawSignal — either a numeric value ("9") or a name
+// with or without the SIG prefix ("KILL", "sigterm") — into a syscall.Signal.
+func parseSignal(rawSignal string) (syscall.Signal, error) {
+       s, err := strconv.Atoi(rawSignal)
+       if err == nil {
+               // Numeric form is accepted as-is; no range validation here.
+               return syscall.Signal(s), nil
+       }
+       // Name lookup is case-insensitive and tolerates a leading "SIG".
+       signal, ok := signalMap[strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")]
+       if !ok {
+               return -1, fmt.Errorf("unknown signal %q", rawSignal)
+       }
+       return signal, nil
+}
diff --git a/libcontainer/README.md b/libcontainer/README.md
new file mode 100644 (file)
index 0000000..a791ca2
--- /dev/null
@@ -0,0 +1,331 @@
+# libcontainer
+
+[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
+
+Libcontainer provides a native Go implementation for creating containers
+with namespaces, cgroups, capabilities, and filesystem access controls.
+It allows you to manage the lifecycle of the container, performing additional operations
+after the container is created.
+
+
+#### Container
+A container is a self-contained execution environment that shares the kernel of the
+host system and which is (optionally) isolated from other containers in the system.
+
+#### Using libcontainer
+
+Because containers are spawned in a two-step process you will need a binary that
+will be executed as the init process for the container. In libcontainer, we use
+the current binary (/proc/self/exe) as the init process, invoked with the
+argument "init". We call the first step "bootstrap", so you always need an "init"
+function as the entry point of the bootstrap stage.
+
+In addition to the Go `init` function, the early-stage bootstrap is handled by importing
+[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
+
+```go
+import (
+       _ "github.com/opencontainers/runc/libcontainer/nsenter"
+)
+
+func init() {
+       if len(os.Args) > 1 && os.Args[1] == "init" {
+               runtime.GOMAXPROCS(1)
+               runtime.LockOSThread()
+               factory, _ := libcontainer.New("")
+               if err := factory.StartInitialization(); err != nil {
+                       logrus.Fatal(err)
+               }
+               panic("--this line should have never been executed, congratulations--")
+       }
+}
+```
+
+Then to create a container you first have to initialize an instance of a factory
+that will handle the creation and initialization for a container.
+
+```go
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
+if err != nil {
+       logrus.Fatal(err)
+       return
+}
+```
+
+Once you have an instance of the factory created we can create a configuration
+struct describing how the container is to be created. A sample would look similar to this:
+
+```go
+defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+config := &configs.Config{
+       Rootfs: "/your/path/to/rootfs",
+       Capabilities: &configs.Capabilities{
+                Bounding: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Effective: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Inheritable: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Permitted: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Ambient: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+        },
+       Namespaces: configs.Namespaces([]configs.Namespace{
+               {Type: configs.NEWNS},
+               {Type: configs.NEWUTS},
+               {Type: configs.NEWIPC},
+               {Type: configs.NEWPID},
+               {Type: configs.NEWUSER},
+               {Type: configs.NEWNET},
+               {Type: configs.NEWCGROUP},
+       }),
+       Cgroups: &configs.Cgroup{
+               Name:   "test-container",
+               Parent: "system",
+               Resources: &configs.Resources{
+                       MemorySwappiness: nil,
+                       AllowAllDevices:  nil,
+                       AllowedDevices:   configs.DefaultAllowedDevices,
+               },
+       },
+       MaskPaths: []string{
+               "/proc/kcore",
+               "/sys/firmware",
+       },
+       ReadonlyPaths: []string{
+               "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+       },
+       Devices:  configs.DefaultAutoCreatedDevices,
+       Hostname: "testing",
+       Mounts: []*configs.Mount{
+               {
+                       Source:      "proc",
+                       Destination: "/proc",
+                       Device:      "proc",
+                       Flags:       defaultMountFlags,
+               },
+               {
+                       Source:      "tmpfs",
+                       Destination: "/dev",
+                       Device:      "tmpfs",
+                       Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
+                       Data:        "mode=755",
+               },
+               {
+                       Source:      "devpts",
+                       Destination: "/dev/pts",
+                       Device:      "devpts",
+                       Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
+                       Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
+               },
+               {
+                       Device:      "tmpfs",
+                       Source:      "shm",
+                       Destination: "/dev/shm",
+                       Data:        "mode=1777,size=65536k",
+                       Flags:       defaultMountFlags,
+               },
+               {
+                       Source:      "mqueue",
+                       Destination: "/dev/mqueue",
+                       Device:      "mqueue",
+                       Flags:       defaultMountFlags,
+               },
+               {
+                       Source:      "sysfs",
+                       Destination: "/sys",
+                       Device:      "sysfs",
+                       Flags:       defaultMountFlags | unix.MS_RDONLY,
+               },
+       },
+       UidMappings: []configs.IDMap{
+               {
+                       ContainerID: 0,
+                       HostID: 1000,
+                       Size: 65536,
+               },
+       },
+       GidMappings: []configs.IDMap{
+               {
+                       ContainerID: 0,
+                       HostID: 1000,
+                       Size: 65536,
+               },
+       },
+       Networks: []*configs.Network{
+               {
+                       Type:    "loopback",
+                       Address: "127.0.0.1/0",
+                       Gateway: "localhost",
+               },
+       },
+       Rlimits: []configs.Rlimit{
+               {
+                       Type: unix.RLIMIT_NOFILE,
+                       Hard: uint64(1025),
+                       Soft: uint64(1025),
+               },
+       },
+}
+```
+
+Once you have the configuration populated you can create a container:
+
+```go
+container, err := factory.Create("container-id", config)
+if err != nil {
+       logrus.Fatal(err)
+       return
+}
+```
+
+To spawn bash as the initial process inside the container and have the
+processes pid returned in order to wait, signal, or kill the process:
+
+```go
+process := &libcontainer.Process{
+       Args:   []string{"/bin/bash"},
+       Env:    []string{"PATH=/bin"},
+       User:   "daemon",
+       Stdin:  os.Stdin,
+       Stdout: os.Stdout,
+       Stderr: os.Stderr,
+       Init:   true,
+}
+
+err := container.Run(process)
+if err != nil {
+       container.Destroy()
+       logrus.Fatal(err)
+       return
+}
+
+// wait for the process to finish.
+_, err := process.Wait()
+if err != nil {
+       logrus.Fatal(err)
+}
+
+// destroy the container.
+container.Destroy()
+```
+
+Additional ways to interact with a running container are:
+
+```go
+// return all the pids for all processes running inside the container.
+processes, err := container.Processes()
+
+// get detailed cpu, memory, io, and network statistics for the container and
+// it's processes.
+stats, err := container.Stats()
+
+// pause all processes inside the container.
+container.Pause()
+
+// resume all paused processes.
+container.Resume()
+
+// send signal to container's init process.
+container.Signal(signal)
+
+// update container resource constraints.
+container.Set(config)
+
+// get current status of the container.
+status, err := container.Status()
+
+// get current container's state information.
+state, err := container.State()
+```
+
+
+#### Checkpoint & Restore
+
+libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
+This lets you save the state of a process running inside a container to disk, and then restore
+that state into a new process, on the same machine or on another machine.
+
+`criu` version 1.5.2 or higher is required to use checkpoint and restore.
+If you don't already have `criu` installed, you can build it from source, following the
+[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
+generated when building libcontainer with docker.
+
+
+## Copyright and license
+
+Code and documentation copyright 2014 Docker, inc.
+The code and documentation are released under the [Apache 2.0 license](../LICENSE).
+The documentation is also released under Creative Commons Attribution 4.0 International License.
+You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.
diff --git a/libcontainer/SPEC.md b/libcontainer/SPEC.md
new file mode 100644 (file)
index 0000000..07ebdc1
--- /dev/null
@@ -0,0 +1,465 @@
+## Container Specification - v1
+
+This is the standard configuration for version 1 containers.  It includes
+namespaces, standard filesystem setup, a default Linux capability set, and
+information about resource reservations.  It also has information about any 
+populated environment settings for the processes running inside a container.
+
+Along with the configuration of how a container is created the standard also
+discusses actions that can be performed on a container to manage and inspect
+information about the processes running inside.
+
+The v1 profile is meant to be able to accommodate the majority of applications
+with a strong security configuration.
+
+### System Requirements and Compatibility
+
+Minimum requirements:
+* Kernel version - 3.10 recommended 2.6.2x minimum (with backported patches)
+* Mounted cgroups with each subsystem in its own hierarchy
+
+
+### Namespaces
+
+|     Flag        | Enabled |
+| --------------- | ------- |
+| CLONE_NEWPID    |    1    |
+| CLONE_NEWUTS    |    1    |
+| CLONE_NEWIPC    |    1    |
+| CLONE_NEWNET    |    1    |
+| CLONE_NEWNS     |    1    |
+| CLONE_NEWUSER   |    1    |
+| CLONE_NEWCGROUP |    1    |
+
+Namespaces are created for the container via the `unshare` syscall.
+
+
+### Filesystem
+
+A root filesystem must be provided to a container for execution.  The container
+will use this root filesystem (rootfs) to jail and spawn processes inside where
+the binaries and system libraries are local to that directory.  Any binaries
+to be executed must be contained within this rootfs.
+
+Mounts that happen inside the container are automatically cleaned up when the
+container exits as the mount namespace is destroyed and the kernel will 
+unmount all the mounts that were setup within that namespace.
+
+For a container to execute properly there are certain filesystems that 
+are required to be mounted within the rootfs that the runtime will setup.
+
+|     Path    |  Type  |                  Flags                 |                 Data                     |
+| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
+| /proc       | proc   | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
+| /dev        | tmpfs  | MS_NOEXEC,MS_STRICTATIME               | mode=755                                 |
+| /dev/shm    | tmpfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV           | mode=1777,size=65536k                    |
+| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
+| /dev/pts    | devpts | MS_NOEXEC,MS_NOSUID                    | newinstance,ptmxmode=0666,mode=620,gid=5 |
+| /sys        | sysfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY |                                          |
+
+
+After a container's filesystems are mounted within the newly created 
+mount namespace `/dev` will need to be populated with a set of device nodes.
+It is expected that a rootfs does not need to have any device nodes specified
+for `/dev` within the rootfs as the container will setup the correct devices
+that are required for executing a container's process.
+
+|      Path    | Mode |   Access   |
+| ------------ | ---- | ---------- |
+| /dev/null    | 0666 |  rwm       |
+| /dev/zero    | 0666 |  rwm       |
+| /dev/full    | 0666 |  rwm       |
+| /dev/tty     | 0666 |  rwm       |
+| /dev/random  | 0666 |  rwm       |
+| /dev/urandom | 0666 |  rwm       |
+
+
+**ptmx**
+`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
+the container.  
+
+The use of a pseudo TTY is optional within a container and it should support both.
+If a pseudo TTY is provided to the container `/dev/console` will need to be 
+setup by binding the console in `/dev/` after it has been populated and mounted
+in tmpfs.
+
+|      Source     | Destination  | UID GID | Mode | Type |
+| --------------- | ------------ | ------- | ---- | ---- |
+| *pty host path* | /dev/console | 0 0     | 0600 | bind | 
+
+
+After `/dev/null` has been setup we check for any external links between
+the container's io, STDIN, STDOUT, STDERR.  If the container's io is pointing
+to `/dev/null` outside the container we close and `dup2` the `/dev/null` 
+that is local to the container's rootfs.
+
+
+After the container has `/proc` mounted a few standard symlinks are setup 
+within `/dev/` for the io.
+
+|    Source       | Destination |
+| --------------- | ----------- |
+| /proc/self/fd   | /dev/fd     |
+| /proc/self/fd/0 | /dev/stdin  |
+| /proc/self/fd/1 | /dev/stdout |
+| /proc/self/fd/2 | /dev/stderr |
+
+A `pivot_root` is used to change the root for the process, effectively 
+jailing the process inside the rootfs.
+
+```c
+put_old = mkdir(...);
+pivot_root(rootfs, put_old);
+chdir("/");
+unmount(put_old, MS_DETACH);
+rmdir(put_old);
+```
+
+For containers running with a rootfs inside `ramfs` a `MS_MOVE` combined
+with a `chroot` is required as `pivot_root` is not supported in `ramfs`.
+
+```c
+mount(rootfs, "/", NULL, MS_MOVE, NULL);
+chroot(".");
+chdir("/");
+```
+
+The `umask` is set back to `0022` after the filesystem setup has been completed.
+
+### Resources
+
+Cgroups are used to handle resource allocation for containers.  This includes
+system resources like cpu, memory, and device access.
+
+| Subsystem  | Enabled |
+| ---------- | ------- |
+| devices    | 1       |
+| memory     | 1       |
+| cpu        | 1       |
+| cpuacct    | 1       |
+| cpuset     | 1       |
+| blkio      | 1       |
+| perf_event | 1       |
+| freezer    | 1       |
+| hugetlb    | 1       |
+| pids       | 1       |
+
+
+All cgroup subsystems are joined so that statistics can be collected from
+each of the subsystems.  Freezer does not expose any stats but is joined
+so that containers can be paused and resumed.
+
+The parent process of the container's init must place the init pid inside
+the correct cgroups before the initialization begins.  This is done so
+that no processes or threads escape the cgroups.  This sync is 
+done via a pipe ( specified in the runtime section below ) that the container's
+init process will block waiting for the parent to finish setup.
+
+### IntelRdt
+
+Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
+two sub-features of RDT.
+
+Cache Allocation Technology (CAT) provides a way for the software to restrict
+cache allocation to a defined 'subset' of L3 cache which may be overlapping
+with other 'subsets'. The different subsets are identified by class of
+service (CLOS) and each CLOS has a capacity bitmask (CBM).
+
+Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
+over memory bandwidth for the software. A user controls the resource by
+indicating the percentage of maximum memory bandwidth or memory bandwidth limit
+in MBps unit if MBA Software Controller is enabled.
+
+It can be used to handle L3 cache and memory bandwidth resources allocation
+for containers if hardware and kernel support Intel RDT CAT and MBA features.
+
+In Linux 4.10 kernel or newer, the interface is defined and exposed via
+"resource control" filesystem, which is a "cgroup-like" interface.
+
+Comparing with cgroups, it has similar process management lifecycle and
+interfaces in a container. But unlike cgroups' hierarchy, it has single level
+filesystem layout.
+
+CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
+"resource control" filesystem.
+
+Intel RDT "resource control" filesystem hierarchy:
+```
+mount -t resctrl resctrl /sys/fs/resctrl
+tree /sys/fs/resctrl
+/sys/fs/resctrl/
+|-- info
+|   |-- L3
+|   |   |-- cbm_mask
+|   |   |-- min_cbm_bits
+|   |   |-- num_closids
+|   |-- MB
+|       |-- bandwidth_gran
+|       |-- delay_linear
+|       |-- min_bandwidth
+|       |-- num_closids
+|-- ...
+|-- schemata
+|-- tasks
+|-- <container_id>
+    |-- ...
+    |-- schemata
+    |-- tasks
+```
+
+For runc, we can make use of `tasks` and `schemata` configuration for L3
+cache and memory bandwidth resources constraints.
+
+The file `tasks` has a list of tasks that belongs to this group (e.g.,
+"<container_id>" group). Tasks can be added to a group by writing the task ID
+to the "tasks" file (which will automatically remove them from the previous
+group to which they belonged). New tasks created by fork(2) and clone(2) are
+added to the same group as their parent.
+
+The file `schemata` has a list of all the resources available to this group.
+Each resource (L3 cache, memory bandwidth) has its own line and format.
+
+L3 cache schema:
+It has allocation bitmasks/values for L3 cache on each socket, which
+contains L3 cache id and capacity bitmask (CBM).
+```
+       Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+```
+For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
+which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+
+The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
+be set is less than the max bit. The max bits in the CBM is varied among
+supported Intel CPU models. Kernel will check if it is valid when writing.
+e.g., default value 0xfffff in root indicates the max bits of CBM is 20
+bits, which maps to the entire L3 cache capacity. Some valid CBM values to
+set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+
+Memory bandwidth schema:
+It has allocation values for memory bandwidth on each socket, which contains
+L3 cache id and memory bandwidth.
+```
+       Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+```
+For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
+
+The minimum bandwidth percentage value for each CPU model is predefined and
+can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
+that is allocated is also dependent on the CPU model and can be looked up at
+"info/MB/bandwidth_gran". The available bandwidth control steps are:
+min_bw + N * bw_gran. Intermediate values are rounded to the next control
+step available on the hardware.
+
+If MBA Software Controller is enabled through mount option "-o mba_MBps"
+mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
+We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit
+instead of "percentages". The kernel underneath would use a software feedback
+mechanism or a "Software Controller" which reads the actual bandwidth using
+MBM counters and adjust the memory bandwidth percentages to ensure:
+"actual memory bandwidth < user specified memory bandwidth".
+
+For example, on a two-socket machine, the schema line could be
+"MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0
+and 7000 MBps memory bandwidth limit on socket 1.
+
+For more information about Intel RDT kernel interface:  
+https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+
+```
+An example for runc:
+Consider a two-socket machine with two L3 caches where the default CBM is
+0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
+with a memory bandwidth granularity of 10%.
+
+Tasks inside the container only have access to the "upper" 7/11 of L3 cache
+on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
+maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
+
+"linux": {
+    "intelRdt": {
+        "closID": "guaranteed_group",
+        "l3CacheSchema": "L3:0=7f0;1=1f",
+        "memBwSchema": "MB:0=20;1=70"
+    }
+}
+```
+
+### Security 
+
+The standard set of Linux capabilities that are set in a container
+provide a good default for security and flexibility for the applications.
+
+
+|     Capability       | Enabled |
+| -------------------- | ------- |
+| CAP_NET_RAW          | 1       |
+| CAP_NET_BIND_SERVICE | 1       |
+| CAP_AUDIT_READ       | 1       |
+| CAP_AUDIT_WRITE      | 1       |
+| CAP_DAC_OVERRIDE     | 1       |
+| CAP_SETFCAP          | 1       |
+| CAP_SETPCAP          | 1       |
+| CAP_SETGID           | 1       |
+| CAP_SETUID           | 1       |
+| CAP_MKNOD            | 1       |
+| CAP_CHOWN            | 1       |
+| CAP_FOWNER           | 1       |
+| CAP_FSETID           | 1       |
+| CAP_KILL             | 1       |
+| CAP_SYS_CHROOT       | 1       |
+| CAP_NET_BROADCAST    | 0       |
+| CAP_SYS_MODULE       | 0       |
+| CAP_SYS_RAWIO        | 0       |
+| CAP_SYS_PACCT        | 0       |
+| CAP_SYS_ADMIN        | 0       |
+| CAP_SYS_NICE         | 0       |
+| CAP_SYS_RESOURCE     | 0       |
+| CAP_SYS_TIME         | 0       |
+| CAP_SYS_TTY_CONFIG   | 0       |
+| CAP_AUDIT_CONTROL    | 0       |
+| CAP_MAC_OVERRIDE     | 0       |
+| CAP_MAC_ADMIN        | 0       |
+| CAP_NET_ADMIN        | 0       |
+| CAP_SYSLOG           | 0       |
+| CAP_DAC_READ_SEARCH  | 0       |
+| CAP_LINUX_IMMUTABLE  | 0       |
+| CAP_IPC_LOCK         | 0       |
+| CAP_IPC_OWNER        | 0       |
+| CAP_SYS_PTRACE       | 0       |
+| CAP_SYS_BOOT         | 0       |
+| CAP_LEASE            | 0       |
+| CAP_WAKE_ALARM       | 0       |
+| CAP_BLOCK_SUSPEND    | 0       |
+
+
+Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
+and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
+the containers.  A container should support setting an apparmor profile or 
+selinux process and mount labels if provided in the configuration.  
+
+Standard apparmor profile:
+```c
+#include <tunables/global>
+profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
+  #include <abstractions/base>
+  network,
+  capability,
+  file,
+  umount,
+
+  deny @{PROC}/sys/fs/** wklx,
+  deny @{PROC}/sysrq-trigger rwklx,
+  deny @{PROC}/mem rwklx,
+  deny @{PROC}/kmem rwklx,
+  deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
+  deny @{PROC}/sys/kernel/*/** wklx,
+
+  deny mount,
+
+  deny /sys/[^f]*/** wklx,
+  deny /sys/f[^s]*/** wklx,
+  deny /sys/fs/[^c]*/** wklx,
+  deny /sys/fs/c[^g]*/** wklx,
+  deny /sys/fs/cg[^r]*/** wklx,
+  deny /sys/firmware/efi/efivars/** rwklx,
+  deny /sys/kernel/security/** rwklx,
+}
+```
+
+*TODO: seccomp work is being done to find a good default config*
+
+### Runtime and Init Process
+
+During container creation the parent process needs to talk to the container's init 
+process and have a form of synchronization.  This is accomplished by creating
+a pipe that is passed to the container's init.  When the init process first spawns 
+it will block on its side of the pipe until the parent closes its side.  This
+allows the parent to have time to set the new process inside a cgroup hierarchy 
+and/or write any uid/gid mappings required for user namespaces.  
+The pipe is passed to the init process via FD 3.
+
+The application consuming libcontainer should be compiled statically.  libcontainer
+does not define any init process and the arguments provided are used to `exec` the
+process inside the application.  There should be no long running init within the 
+container spec.
+
+If a pseudo tty is provided to a container it will open and `dup2` the console
+as the container's STDIN, STDOUT, STDERR as well as mounting the console
+as `/dev/console`.
+
+An extra set of mounts are provided to a container and setup for use.  A container's
+rootfs can contain some non portable files inside that can cause side effects during
+execution of a process.  These files are usually created and populated with the container
+specific information via the runtime.  
+
+**Extra runtime files:**
+* /etc/hosts 
+* /etc/resolv.conf
+* /etc/hostname
+* /etc/localtime
+
+
+#### Defaults
+
+There are a few defaults that can be overridden by users, but in their omission
+these apply to processes within a container.
+
+|       Type          |             Value              |
+| ------------------- | ------------------------------ |
+| Parent Death Signal | SIGKILL                        | 
+| UID                 | 0                              |
+| GID                 | 0                              |
+| GROUPS              | 0, NULL                        |
+| CWD                 | "/"                            |
+| $HOME               | Current user's home dir or "/" |
+| Readonly rootfs     | false                          |
+| Pseudo TTY          | false                          |
+
+
+## Actions
+
+After a container is created there is a standard set of actions that can
+be done to the container.  These actions are part of the public API for 
+a container.
+
+|     Action     |                         Description                                |
+| -------------- | ------------------------------------------------------------------ |
+| Get processes  | Return all the pids for processes running inside a container       | 
+| Get Stats      | Return resource statistics for the container as a whole            |
+| Wait           | Waits on the container's init process ( pid 1 )                    |
+| Wait Process   | Wait on any of the container's processes returning the exit status | 
+| Destroy        | Kill the container's init process and remove any filesystem state  |
+| Signal         | Send a signal to the container's init process                      |
+| Signal Process | Send a signal to any of the container's processes                  |
+| Pause          | Pause all processes inside the container                           |
+| Resume         | Resume all processes inside the container if paused                |
+| Exec           | Execute a new process inside of the container  ( requires setns )  |
+| Set            | Setup configs of the container after it's created                  |
+
+### Execute a new process inside of a running container
+
+User can execute a new process inside of a running container. Any binaries to be
+executed must be accessible within the container's rootfs.
+
+The started process will run inside the container's rootfs. Any changes
+made by the process to the container's filesystem will persist after the
+process finished executing.
+
+The started process will join all the container's existing namespaces. When the
+container is paused, the process will also be paused and will resume when
+the container is unpaused.  The started process will only run when the container's
+primary process (PID 1) is running, and will not be restarted when the container
+is restarted.
+
+#### Planned additions
+
+The started process will have its own cgroups nested inside the container's
+cgroups. This is used for process tracking and optionally resource allocation
+handling for the new process. Freezer cgroup is required, the rest of the cgroups
+are optional. The process executor must place its pid inside the correct
+cgroups before starting the process. This is done so that no child processes or
+threads can escape the cgroups.
+
+When the process is stopped, the process executor will try (in a best-effort way)
+to stop all its children and remove the sub-cgroups.
diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go
new file mode 100644 (file)
index 0000000..debfc1e
--- /dev/null
@@ -0,0 +1,60 @@
+// +build apparmor,linux
+
+package apparmor
+
+import (
+       "fmt"
+       "io/ioutil"
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// IsEnabled returns true if apparmor is enabled for the host.
+func IsEnabled() bool {
+       if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { // securityfs dir must exist; a set "container" env var presumably means we already run inside a container — TODO confirm
+               if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { // the userspace parser binary must also be installed
+                       buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
+                       return err == nil && len(buf) > 1 && buf[0] == 'Y' // file starts with 'Y' when the LSM is active
+               }
+       }
+       return false
+}
+
+func setProcAttr(attr, value string) error { // setProcAttr writes value to /proc/self/attr/<attr>
+       // Under AppArmor you can only change your own attr, so use /proc/self/
+       // instead of /proc/<tid>/ like libapparmor does
+       path := fmt.Sprintf("/proc/self/attr/%s", attr)
+
+       f, err := os.OpenFile(path, os.O_WRONLY, 0)
+       if err != nil {
+               return err
+       }
+       defer f.Close()
+
+       if err := utils.EnsureProcHandle(f); err != nil { // guard that f really is a procfs handle — TODO confirm helper semantics
+               return err
+       }
+
+       _, err = fmt.Fprintf(f, "%s", value)
+       return err
+}
+
+// changeOnExec reimplements aa_change_onexec from libapparmor in Go
+func changeOnExec(name string) error {
+       value := "exec " + name // the "exec " prefix asks the kernel to switch profile on next exec
+       if err := setProcAttr("exec", value); err != nil {
+               return fmt.Errorf("apparmor failed to apply profile: %s", err)
+       }
+       return nil
+}
+
+// ApplyProfile will apply the profile with the specified name to the process after
+// the next exec.
+func ApplyProfile(name string) error {
+       if name == "" { // an empty profile name is a no-op, not an error
+               return nil
+       }
+
+       return changeOnExec(name)
+}
diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go
new file mode 100644 (file)
index 0000000..d4110cf
--- /dev/null
@@ -0,0 +1,20 @@
+// +build !apparmor !linux
+
+package apparmor
+
+import (
+       "errors"
+)
+
+var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") // returned when a profile is requested on a build/platform without apparmor support
+
+func IsEnabled() bool { // IsEnabled always reports false on non-apparmor builds
+       return false
+}
+
+func ApplyProfile(name string) error { // ApplyProfile fails for any non-empty profile name; empty name is a no-op
+       if name != "" {
+               return ErrApparmorNotEnabled
+       }
+       return nil
+}
diff --git a/libcontainer/capabilities_linux.go b/libcontainer/capabilities_linux.go
new file mode 100644 (file)
index 0000000..9daef29
--- /dev/null
@@ -0,0 +1,117 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/syndtr/gocapability/capability"
+)
+
+const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
+
+var capabilityMap map[string]capability.Cap
+
+func init() { // build the "CAP_XXX" name -> capability.Cap lookup table once at package load
+       capabilityMap = make(map[string]capability.Cap)
+       last := capability.CAP_LAST_CAP
+       // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
+       if last == capability.Cap(63) {
+               last = capability.CAP_BLOCK_SUSPEND
+       }
+       for _, cap := range capability.List() {
+               if cap > last { // skip capabilities beyond what this kernel reports
+                       continue
+               }
+               capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())) // e.g. "CAP_SYS_ADMIN"
+               capabilityMap[capKey] = cap
+       }
+}
+
+func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { // resolve configured CAP_XXX names into capability.Cap values, one slice per capability set; errors on any unknown name
+       bounding := []capability.Cap{}
+       for _, c := range capConfig.Bounding {
+               v, ok := capabilityMap[c]
+               if !ok {
+                       return nil, fmt.Errorf("unknown capability %q", c)
+               }
+               bounding = append(bounding, v)
+       }
+       effective := []capability.Cap{}
+       for _, c := range capConfig.Effective {
+               v, ok := capabilityMap[c]
+               if !ok {
+                       return nil, fmt.Errorf("unknown capability %q", c)
+               }
+               effective = append(effective, v)
+       }
+       inheritable := []capability.Cap{}
+       for _, c := range capConfig.Inheritable {
+               v, ok := capabilityMap[c]
+               if !ok {
+                       return nil, fmt.Errorf("unknown capability %q", c)
+               }
+               inheritable = append(inheritable, v)
+       }
+       permitted := []capability.Cap{}
+       for _, c := range capConfig.Permitted {
+               v, ok := capabilityMap[c]
+               if !ok {
+                       return nil, fmt.Errorf("unknown capability %q", c)
+               }
+               permitted = append(permitted, v)
+       }
+       ambient := []capability.Cap{}
+       for _, c := range capConfig.Ambient {
+               v, ok := capabilityMap[c]
+               if !ok {
+                       return nil, fmt.Errorf("unknown capability %q", c)
+               }
+               ambient = append(ambient, v)
+       }
+       pid, err := capability.NewPid2(0) // pid 0 presumably targets the current process — TODO confirm gocapability semantics
+       if err != nil {
+               return nil, err
+       }
+       err = pid.Load() // load the process's current capability state before staging changes
+       if err != nil {
+               return nil, err
+       }
+       return &containerCapabilities{
+               bounding:    bounding,
+               effective:   effective,
+               inheritable: inheritable,
+               permitted:   permitted,
+               ambient:     ambient,
+               pid:         pid,
+       }, nil
+}
+
+type containerCapabilities struct { // resolved capability sets plus the handle used to apply them
+       pid         capability.Capabilities // capability handle for the target process
+       bounding    []capability.Cap        // bounding set whitelist
+       effective   []capability.Cap        // effective set whitelist
+       inheritable []capability.Cap        // inheritable set whitelist
+       permitted   []capability.Cap        // permitted set whitelist
+       ambient     []capability.Cap        // ambient set whitelist
+}
+
+// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
+func (c *containerCapabilities) ApplyBoundingSet() error {
+       c.pid.Clear(capability.BOUNDS) // drop everything first so only the whitelist remains
+       c.pid.Set(capability.BOUNDS, c.bounding...)
+       return c.pid.Apply(capability.BOUNDS) // commit the staged bounding set to the process
+}
+
+// Apply sets all the capabilities for the current process in the config.
+func (c *containerCapabilities) ApplyCaps() error {
+       c.pid.Clear(allCapabilityTypes) // reset every set (caps, bounds, ambient) before staging
+       c.pid.Set(capability.BOUNDS, c.bounding...)
+       c.pid.Set(capability.PERMITTED, c.permitted...)
+       c.pid.Set(capability.INHERITABLE, c.inheritable...)
+       c.pid.Set(capability.EFFECTIVE, c.effective...)
+       c.pid.Set(capability.AMBIENT, c.ambient...)
+       return c.pid.Apply(allCapabilityTypes) // commit all staged sets at once
+}
diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go
new file mode 100644 (file)
index 0000000..c0a9659
--- /dev/null
@@ -0,0 +1,74 @@
+// +build linux
+
+package cgroups
+
+import (
+       "fmt"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Manager interface {
+       // Applies cgroup configuration to the process with the specified pid
+       Apply(pid int) error
+
+       // Returns the PIDs inside the cgroup set
+       GetPids() ([]int, error)
+
+       // Returns the PIDs inside the cgroup set & all sub-cgroups
+       GetAllPids() ([]int, error)
+
+       // Returns statistics for the cgroup set
+       GetStats() (*Stats, error)
+
+       // Toggles the freezer cgroup according to the specified state
+       Freeze(state configs.FreezerState) error
+
+       // Destroys the cgroup set
+       Destroy() error
+
+       // The option funcs SystemdCgroups() and Cgroupfs() require the following attributes:
+       //      Paths   map[string]string
+       //      Cgroups *configs.Cgroup
+       // Paths maps cgroup subsystem to path at which it is mounted.
+       // Cgroups specifies specific cgroup settings for the various subsystems
+
+       // Returns cgroup paths to save in a state file and to be able to
+       // restore the object later.
+       GetPaths() map[string]string
+
+       // GetUnifiedPath returns the unified path when running in unified mode.
+       // The value corresponds to all the values of the GetPaths() map.
+       //
+       // GetUnifiedPath returns an error when running in hybrid mode as well as
+       // in legacy mode.
+       GetUnifiedPath() (string, error)
+
+       // Sets the cgroup as configured.
+       Set(container *configs.Config) error
+
+       // Gets the cgroup as configured.
+       GetCgroups() (*configs.Cgroup, error)
+}
+
+type NotFoundError struct { // NotFoundError reports a cgroup subsystem whose mountpoint could not be located
+       Subsystem string // name of the missing subsystem, e.g. "cpu"
+}
+
+func (e *NotFoundError) Error() string { // Error implements the error interface
+       return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
+}
+
+func NewNotFoundError(sub string) error { // NewNotFoundError wraps sub in a *NotFoundError
+       return &NotFoundError{
+               Subsystem: sub,
+       }
+}
+
+func IsNotFound(err error) bool { // IsNotFound reports whether err is a *NotFoundError
+       if err == nil {
+               return false
+       }
+       _, ok := err.(*NotFoundError)
+       return ok
+}
diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go
new file mode 100644 (file)
index 0000000..9efb83e
--- /dev/null
@@ -0,0 +1,20 @@
+// +build linux
+
+package cgroups
+
+import (
+       "testing"
+)
+
+func TestParseCgroups(t *testing.T) { // sanity-check parsing of this process's own cgroup file
+       cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+       if err != nil {
+               t.Fatal(err)
+       }
+       if IsCgroup2UnifiedMode() { // on cgroup v2 the per-controller "cpu" key check below does not apply
+               return
+       }
+       if _, ok := cgroups["cpu"]; !ok {
+               t.Fail()
+       }
+}
diff --git a/libcontainer/cgroups/cgroups_unsupported.go b/libcontainer/cgroups/cgroups_unsupported.go
new file mode 100644 (file)
index 0000000..278d507
--- /dev/null
@@ -0,0 +1,3 @@
+// +build !linux
+
+package cgroups
diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
new file mode 100644 (file)
index 0000000..847ce8e
--- /dev/null
@@ -0,0 +1,180 @@
+// Package devicefilter contains an eBPF device filter program
+//
+// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
+//
+// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
+// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
+package devicefilter
+
+import (
+       "fmt"
+       "math"
+
+       "github.com/cilium/ebpf/asm"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+       "golang.org/x/sys/unix"
+)
+
+const (
+	// license is the license string attached to the eBPF program;
+	// its format is the same as the kernel MODULE_LICENSE macro.
+	license = "Apache"
+)
+
+// DeviceFilter returns eBPF device filter program and its license string.
+//
+// Devices are iterated in reverse order because appendDevice must be
+// called from the last element of the OCI linux.resources.devices list
+// to the head element (see appendDevice).
+func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
+	p := &program{}
+	p.init()
+	for i := len(devices) - 1; i >= 0; i-- {
+		if err := p.appendDevice(devices[i]); err != nil {
+			return nil, "", err
+		}
+	}
+	insts, err := p.finalize()
+	return insts, license, err
+}
+
+// program accumulates the eBPF device-filter instruction stream, one
+// "block" of match/decide instructions per device rule.
+type program struct {
+	insts       asm.Instructions
+	hasWildCard bool // a catch-all rule was emitted; later rules are ignored
+	blockID     int  // next block number; set to -1 once finalized
+}
+
+// init emits the program prologue, loading the bpf_cgroup_dev_ctx fields
+// (type, access, major, minor) into registers R2-R5.
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	/*
+		u32 access_type
+		u32 major
+		u32 minor
+	*/
+	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
+
+	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		// RSh: bitwise shift right
+		asm.RSh.Imm32(asm.R3, 16))
+
+	// R4 <- major (u32 major at R1[4])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
+
+	// R5 <- minor (u32 minor at R1[8])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
+}
+
+// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
+func (p *program) appendDevice(dev *configs.Device) error {
+       if p.blockID < 0 {
+               return errors.New("the program is finalized")
+       }
+       if p.hasWildCard {
+               // All entries after wildcard entry are ignored
+               return nil
+       }
+
+       bpfType := int32(-1)
+       hasType := true
+       switch dev.Type {
+       case 'c':
+               bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+       case 'b':
+               bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+       case 'a':
+               hasType = false
+       default:
+               // if not specified in OCI json, typ is set to DeviceTypeAll
+               return errors.Errorf("invalid DeviceType %q", string(dev.Type))
+       }
+       if dev.Major > math.MaxUint32 {
+               return errors.Errorf("invalid major %d", dev.Major)
+       }
+       if dev.Minor > math.MaxUint32 {
+               return errors.Errorf("invalid minor %d", dev.Major)
+       }
+       hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
+       hasMinor := dev.Minor >= 0
+       bpfAccess := int32(0)
+       for _, r := range dev.Permissions {
+               switch r {
+               case 'r':
+                       bpfAccess |= unix.BPF_DEVCG_ACC_READ
+               case 'w':
+                       bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+               case 'm':
+                       bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+               default:
+                       return errors.Errorf("unknown device access %v", r)
+               }
+       }
+       // If the access is rwm, skip the check.
+       hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
+
+       blockSym := fmt.Sprintf("block-%d", p.blockID)
+       nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
+       prevBlockLastIdx := len(p.insts) - 1
+       if hasType {
+               p.insts = append(p.insts,
+                       // if (R2 != bpfType) goto next
+                       asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
+               )
+       }
+       if hasAccess {
+               p.insts = append(p.insts,
+                       // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
+                       asm.Mov.Reg32(asm.R1, asm.R3),
+                       asm.And.Imm32(asm.R1, bpfAccess),
+                       asm.JEq.Imm(asm.R1, 0, nextBlockSym),
+               )
+       }
+       if hasMajor {
+               p.insts = append(p.insts,
+                       // if (R4 != major) goto next
+                       asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
+               )
+       }
+       if hasMinor {
+               p.insts = append(p.insts,
+                       // if (R5 != minor) goto next
+                       asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
+               )
+       }
+       if !hasType && !hasAccess && !hasMajor && !hasMinor {
+               p.hasWildCard = true
+       }
+       p.insts = append(p.insts, acceptBlock(dev.Allow)...)
+       // set blockSym to the first instruction we added in this iteration
+       p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+       p.blockID++
+       return nil
+}
+
+// finalize appends the terminal "reject" block (unless a wildcard rule
+// already terminates the program) and marks the program complete, after
+// which appendDevice returns an error.
+func (p *program) finalize() (asm.Instructions, error) {
+	if p.hasWildCard {
+		// acceptBlock with asm.Return() is already inserted
+		return p.insts, nil
+	}
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	p.insts = append(p.insts,
+		// R0 <- 0
+		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		asm.Return(),
+	)
+	p.blockID = -1
+	return p.insts, nil
+}
+
+// acceptBlock returns the instructions terminating a matched block:
+// R0 <- 1 (accept) or R0 <- 0 (reject), followed by a return.
+func acceptBlock(accept bool) asm.Instructions {
+	v := int32(0)
+	if accept {
+		v = 1
+	}
+	return []asm.Instruction{
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v),
+		asm.Return(),
+	}
+}
diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
new file mode 100644 (file)
index 0000000..59ff4b4
--- /dev/null
@@ -0,0 +1,258 @@
+package devicefilter
+
+import (
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/specconv"
+)
+
+// hash normalizes a disassembly dump for comparison: it trims each line
+// and drops blank lines and lines starting with the comment marker comm.
+// (Despite the name it returns the normalized text, not a digest.)
+func hash(s, comm string) string {
+	var res []string
+	for _, l := range strings.Split(s, "\n") {
+		trimmed := strings.TrimSpace(l)
+		if trimmed == "" || strings.HasPrefix(trimmed, comm) {
+			continue
+		}
+		res = append(res, trimmed)
+	}
+	return strings.Join(res, "\n")
+}
+
+// testDeviceFilter builds the eBPF program for devices and, when
+// expectedStr is non-empty, compares its normalized disassembly against
+// the normalized expectation.
+func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) {
+	insts, _, err := DeviceFilter(devices)
+	if err != nil {
+		t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
+	}
+	s := insts.String()
+	t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s)
+	if expectedStr != "" {
+		hashed := hash(s, "//")
+		expectedHashed := hash(expectedStr, "//")
+		if expectedHashed != hashed {
+			t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed)
+		}
+	}
+}
+
+// TestDeviceFilter_Nil verifies that an empty rule list yields a program
+// that rejects every device access.
+func TestDeviceFilter_Nil(t *testing.T) {
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject)
+        5: Mov32Imm dst: r0 imm: 0
+        6: Exit
+	`
+	testDeviceFilter(t, nil, expected)
+}
+
+// TestDeviceFilter_BuiltInAllowList pins the exact program generated for
+// the default built-in allowed-devices list (specconv.AllowedDevices).
+func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// tuntap (c, 10, 200, rwm, allow)
+         5: JNEImm dst: r2 off: -1 imm: 2 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 10 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 200 <block-1>
+         8: Mov32Imm dst: r0 imm: 1
+         9: Exit
+block-1:
+        10: JNEImm dst: r2 off: -1 imm: 2 <block-2>
+        11: JNEImm dst: r4 off: -1 imm: 5 <block-2>
+        12: JNEImm dst: r5 off: -1 imm: 2 <block-2>
+        13: Mov32Imm dst: r0 imm: 1
+        14: Exit
+block-2:
+// /dev/pts (c, 136, wildcard, rwm, true)
+        15: JNEImm dst: r2 off: -1 imm: 2 <block-3>
+        16: JNEImm dst: r4 off: -1 imm: 136 <block-3>
+        17: Mov32Imm dst: r0 imm: 1
+        18: Exit
+block-3:
+        19: JNEImm dst: r2 off: -1 imm: 2 <block-4>
+        20: JNEImm dst: r4 off: -1 imm: 5 <block-4>
+        21: JNEImm dst: r5 off: -1 imm: 1 <block-4>
+        22: Mov32Imm dst: r0 imm: 1
+        23: Exit
+block-4:
+        24: JNEImm dst: r2 off: -1 imm: 2 <block-5>
+        25: JNEImm dst: r4 off: -1 imm: 1 <block-5>
+        26: JNEImm dst: r5 off: -1 imm: 9 <block-5>
+        27: Mov32Imm dst: r0 imm: 1
+        28: Exit
+block-5:
+        29: JNEImm dst: r2 off: -1 imm: 2 <block-6>
+        30: JNEImm dst: r4 off: -1 imm: 1 <block-6>
+        31: JNEImm dst: r5 off: -1 imm: 5 <block-6>
+        32: Mov32Imm dst: r0 imm: 1
+        33: Exit
+block-6:
+        34: JNEImm dst: r2 off: -1 imm: 2 <block-7>
+        35: JNEImm dst: r4 off: -1 imm: 5 <block-7>
+        36: JNEImm dst: r5 off: -1 imm: 0 <block-7>
+        37: Mov32Imm dst: r0 imm: 1
+        38: Exit
+block-7:
+        39: JNEImm dst: r2 off: -1 imm: 2 <block-8>
+        40: JNEImm dst: r4 off: -1 imm: 1 <block-8>
+        41: JNEImm dst: r5 off: -1 imm: 7 <block-8>
+        42: Mov32Imm dst: r0 imm: 1
+        43: Exit
+block-8:
+        44: JNEImm dst: r2 off: -1 imm: 2 <block-9>
+        45: JNEImm dst: r4 off: -1 imm: 1 <block-9>
+        46: JNEImm dst: r5 off: -1 imm: 8 <block-9>
+        47: Mov32Imm dst: r0 imm: 1
+        48: Exit
+block-9:
+        49: JNEImm dst: r2 off: -1 imm: 2 <block-10>
+        50: JNEImm dst: r4 off: -1 imm: 1 <block-10>
+        51: JNEImm dst: r5 off: -1 imm: 3 <block-10>
+        52: Mov32Imm dst: r0 imm: 1
+        53: Exit
+block-10:
+// (b, wildcard, wildcard, m, true)
+        54: JNEImm dst: r2 off: -1 imm: 1 <block-11>
+        55: Mov32Reg dst: r1 src: r3
+        56: And32Imm dst: r1 imm: 1
+        57: JEqImm dst: r1 off: -1 imm: 0 <block-11>
+        58: Mov32Imm dst: r0 imm: 1
+        59: Exit
+block-11:
+// (c, wildcard, wildcard, m, true)
+        60: JNEImm dst: r2 off: -1 imm: 2 <block-12>
+        61: Mov32Reg dst: r1 src: r3
+        62: And32Imm dst: r1 imm: 1
+        63: JEqImm dst: r1 off: -1 imm: 0 <block-12>
+        64: Mov32Imm dst: r0 imm: 1
+        65: Exit
+block-12:
+        66: Mov32Imm dst: r0 imm: 0
+        67: Exit
+`
+	testDeviceFilter(t, specconv.AllowedDevices, expected)
+}
+
+// TestDeviceFilter_Privileged verifies that a single allow-all wildcard
+// rule produces a program that accepts everything.
+func TestDeviceFilter_Privileged(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+	}
+	expected :=
+		`
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 1 (accept)
+        5: Mov32Imm dst: r0 imm: 1
+        6: Exit
+	`
+	testDeviceFilter(t, devices, expected)
+}
+
+// TestDeviceFilter_PrivilegedExceptSingleDevice verifies that a deny
+// rule listed after the allow-all wildcard is checked first (rules are
+// emitted last-to-first), rejecting only that one device.
+func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       0,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+	}
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 0
+         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 0 <block-1>
+         8: Mov32Imm dst: r0 imm: 0
+         9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
+
+// TestDeviceFilter_Weird documents precedence when a deny rule appears
+// both before and after a wildcard allow: entries before the wildcard
+// are shadowed by it, entries after it still take effect.
+func TestDeviceFilter_Weird(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       1,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       2,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+	}
+	// 8/1 is allowed, 8/2 is not allowed.
+	// This conforms to runc v1.0.0-rc.9 (cgroup1) behavior.
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 2
+         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 2 <block-1>
+         8: Mov32Imm dst: r0 imm: 0
+         9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
diff --git a/libcontainer/cgroups/ebpf/ebpf.go b/libcontainer/cgroups/ebpf/ebpf.go
new file mode 100644 (file)
index 0000000..4795e0a
--- /dev/null
@@ -0,0 +1,45 @@
+package ebpf
+
+import (
+       "github.com/cilium/ebpf"
+       "github.com/cilium/ebpf/asm"
+       "github.com/pkg/errors"
+       "golang.org/x/sys/unix"
+)
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// The returned closer detaches the program; on failure a no-op closer is
+// returned together with the error.
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
+	nilCloser := func() error {
+		return nil
+	}
+	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
+	// This limit is not inherited into the container.
+	memlockLimit := &unix.Rlimit{
+		Cur: unix.RLIM_INFINITY,
+		Max: unix.RLIM_INFINITY,
+	}
+	// Best-effort: a Setrlimit failure here is deliberately ignored; any
+	// real problem will surface from the BPF_PROG_LOAD below.
+	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+	spec := &ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	}
+	prog, err := ebpf.NewProgram(spec)
+	if err != nil {
+		return nilCloser, err
+	}
+	if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+	}
+	closer := func() error {
+		if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+		}
+		return nil
+	}
+	return closer, nil
+}
diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go
new file mode 100644 (file)
index 0000000..ec148b4
--- /dev/null
@@ -0,0 +1,411 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "io"
+       "os"
+       "path/filepath"
+       "sync"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/pkg/errors"
+       "golang.org/x/sys/unix"
+)
+
+var (
+	// subsystemsLegacy is the ordered set of cgroup v1 controllers
+	// managed by this package.
+	subsystemsLegacy = subsystemSet{
+		&CpusetGroup{},
+		&DevicesGroup{},
+		&MemoryGroup{},
+		&CpuGroup{},
+		&CpuacctGroup{},
+		&PidsGroup{},
+		&BlkioGroup{},
+		&HugetlbGroup{},
+		&NetClsGroup{},
+		&NetPrioGroup{},
+		&PerfEventGroup{},
+		&FreezerGroup{},
+		// Join (but do not otherwise manage) the "name=systemd" hierarchy.
+		&NameGroup{GroupName: "name=systemd", Join: true},
+	}
+	// HugePageSizes caches the host's supported hugepage sizes; the
+	// lookup error is deliberately discarded (empty on failure).
+	HugePageSizes, _ = cgroups.GetHugePageSize()
+)
+
+// errSubsystemDoesNotExist is returned by subsystemSet.Get for unknown names.
+var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
+
+// subsystemSet is an ordered collection of cgroup v1 subsystems.
+type subsystemSet []subsystem
+
+// Get returns the subsystem with the given name, or
+// errSubsystemDoesNotExist when it is not part of the set.
+func (s subsystemSet) Get(name string) (subsystem, error) {
+	for _, ss := range s {
+		if ss.Name() == name {
+			return ss, nil
+		}
+	}
+	return nil, errSubsystemDoesNotExist
+}
+
+// subsystem is the interface implemented by every cgroup v1 controller
+// (cpu, memory, devices, ...).
+type subsystem interface {
+	// Name returns the name of the subsystem.
+	Name() string
+	// GetStats fills 'stats' from the cgroup found under 'path'.
+	GetStats(path string, stats *cgroups.Stats) error
+	// Remove removes the cgroup represented by 'cgroupData'.
+	Remove(*cgroupData) error
+	// Apply creates and joins the cgroup represented by 'cgroupData'.
+	Apply(*cgroupData) error
+	// Set applies the resources in 'cgroup' to the cgroup at 'path'.
+	Set(path string, cgroup *configs.Cgroup) error
+}
+
+// Manager manages a container's cgroups on a legacy (cgroup v1) host.
+type Manager struct {
+	mu       sync.Mutex
+	Cgroups  *configs.Cgroup
+	Rootless bool // ignore permission-related errors
+	Paths    map[string]string
+}
+
+// The absolute path to the root of the cgroup hierarchies.
+var cgroupRootLock sync.Mutex
+var cgroupRoot string
+
+// getCgroupRoot returns the cgroup mountpoint root, discovering and
+// caching it on first use. Safe for concurrent use.
+func getCgroupRoot() (string, error) {
+	cgroupRootLock.Lock()
+	defer cgroupRootLock.Unlock()
+
+	// Fast path: already discovered.
+	if cgroupRoot != "" {
+		return cgroupRoot, nil
+	}
+
+	root, err := cgroups.FindCgroupMountpointDir()
+	if err != nil {
+		return "", err
+	}
+
+	// Make sure the mountpoint actually exists before caching it.
+	if _, err := os.Stat(root); err != nil {
+		return "", err
+	}
+
+	cgroupRoot = root
+	return cgroupRoot, nil
+}
+
+// cgroupData bundles the information needed to compute and join a
+// container's per-subsystem cgroup paths.
+type cgroupData struct {
+	root      string
+	innerPath string
+	config    *configs.Cgroup
+	pid       int
+}
+
+// isIgnorableError returns whether err is a permission error (in the loose
+// sense of the word). This includes EROFS (which for an unprivileged user is
+// basically a permission error) and EACCES (for similar reasons) as well as
+// the normal EPERM.
+func isIgnorableError(rootless bool, err error) bool {
+	// We do not ignore errors if we are root.
+	if !rootless {
+		return false
+	}
+	// Is it an ordinary EPERM?
+	if os.IsPermission(errors.Cause(err)) {
+		return true
+	}
+
+	// Try to handle other errnos by unwrapping the os-level error types.
+	var errno error
+	switch err := errors.Cause(err).(type) {
+	case *os.PathError:
+		errno = err.Err
+	case *os.LinkError:
+		errno = err.Err
+	case *os.SyscallError:
+		errno = err.Err
+	}
+	return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
+}
+
+// getSubsystems returns the set of cgroup v1 subsystems to manage.
+func (m *Manager) getSubsystems() subsystemSet {
+	return subsystemsLegacy
+}
+
+// Apply creates (as needed) and joins the container's cgroup for every
+// subsystem, placing pid into each, and records the resulting paths in
+// m.Paths. A nil m.Cgroups is a no-op. When c.Paths is set, the
+// pre-existing cgroups are joined instead of being created.
+func (m *Manager) Apply(pid int) (err error) {
+	if m.Cgroups == nil {
+		return nil
+	}
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	var c = m.Cgroups
+
+	d, err := getCgroupData(m.Cgroups, pid)
+	if err != nil {
+		return err
+	}
+
+	m.Paths = make(map[string]string)
+	if c.Paths != nil {
+		// Pre-existing cgroup paths were supplied: just join them,
+		// skipping any subsystem that is not mounted.
+		for name, path := range c.Paths {
+			_, err := d.path(name)
+			if err != nil {
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			m.Paths[name] = path
+		}
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
+	for _, sys := range m.getSubsystems() {
+		// TODO: Apply should, ideally, be reentrant or be broken up into a separate
+		// create and join phase so that the cgroup hierarchy for a container can be
+		// created then join consists of writing the process pids to cgroup.procs
+		p, err := d.path(sys.Name())
+		if err != nil {
+			// The non-presence of the devices subsystem is
+			// considered fatal for security reasons.
+			if cgroups.IsNotFound(err) && sys.Name() != "devices" {
+				continue
+			}
+			return err
+		}
+		m.Paths[sys.Name()] = p
+
+		if err := sys.Apply(d); err != nil {
+			// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
+			// been set, we don't bail on error in case of permission problems.
+			// Cases where limits have been set (and we couldn't create our own
+			// cgroup) are handled by Set.
+			if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
+				delete(m.Paths, sys.Name())
+				continue
+			}
+			return err
+		}
+
+	}
+	return nil
+}
+
+// Destroy removes the cgroup directories recorded in m.Paths. It is a
+// no-op when cgroups were not configured or when pre-existing paths were
+// supplied (we did not create them, so we do not remove them).
+func (m *Manager) Destroy() error {
+	if m.Cgroups == nil || m.Cgroups.Paths != nil {
+		return nil
+	}
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	if err := cgroups.RemovePaths(m.Paths); err != nil {
+		return err
+	}
+	m.Paths = make(map[string]string)
+	return nil
+}
+
+// GetPaths returns the per-subsystem cgroup paths recorded by Apply.
+func (m *Manager) GetPaths() map[string]string {
+	m.mu.Lock()
+	paths := m.Paths
+	m.mu.Unlock()
+	return paths
+}
+
+// GetUnifiedPath always fails: the unified path only exists in cgroup v2
+// unified mode, and this manager handles legacy (v1) mode.
+func (m *Manager) GetUnifiedPath() (string, error) {
+	return "", errors.New("unified path is only supported when running in unified mode")
+}
+
+// GetStats collects statistics from every recorded cgroup path.
+// Unknown subsystems and paths that no longer exist are skipped.
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	stats := cgroups.NewStats()
+	for name, path := range m.Paths {
+		sys, err := m.getSubsystems().Get(name)
+		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
+			continue
+		}
+		if err := sys.GetStats(path, stats); err != nil {
+			return nil, err
+		}
+	}
+	return stats, nil
+}
+
+// Set applies the resource limits in container.Cgroups to the cgroups
+// created/joined by Apply. It is a no-op when there is no cgroup config
+// or when pre-existing paths were supplied (join-only mode).
+func (m *Manager) Set(container *configs.Config) error {
+	if container.Cgroups == nil {
+		return nil
+	}
+
+	// If Paths are set, then we are just joining cgroups paths
+	// and there is no need to set any values.
+	if m.Cgroups != nil && m.Cgroups.Paths != nil {
+		return nil
+	}
+
+	paths := m.GetPaths()
+	for _, sys := range m.getSubsystems() {
+		path := paths[sys.Name()]
+		if err := sys.Set(path, container.Cgroups); err != nil {
+			// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
+			// However, errors from other subsystems are not ignored.
+			// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+			if m.Rootless && sys.Name() == "devices" {
+				continue
+			}
+			if path == "" {
+				// We never created a path for this cgroup, so we cannot set
+				// limits for it (though we have already tried at this point).
+				return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
+			}
+			return err
+		}
+	}
+
+	if m.Paths["cpu"] != "" {
+		// Verify that the kernel accepted the requested cpu-shares value.
+		if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Freeze toggles the container's freezer cgroup depending on the state
+// provided. On failure the previously recorded freezer state is restored
+// in the in-memory configuration.
+func (m *Manager) Freeze(state configs.FreezerState) error {
+	if m.Cgroups == nil {
+		return errors.New("cannot toggle freezer: cgroups not configured for container")
+	}
+
+	paths := m.GetPaths()
+	dir := paths["freezer"]
+	prevState := m.Cgroups.Resources.Freezer
+	m.Cgroups.Resources.Freezer = state
+	freezer, err := m.getSubsystems().Get("freezer")
+	if err != nil {
+		return err
+	}
+	err = freezer.Set(dir, m.Cgroups)
+	if err != nil {
+		// Roll back the recorded state on failure.
+		m.Cgroups.Resources.Freezer = prevState
+		return err
+	}
+	return nil
+}
+
+// GetPids returns the pids in the container's "devices" cgroup (used
+// because Apply treats the devices subsystem as mandatory).
+func (m *Manager) GetPids() ([]int, error) {
+	paths := m.GetPaths()
+	return cgroups.GetPids(paths["devices"])
+}
+
+// GetAllPids returns the pids in the container's "devices" cgroup,
+// including its descendants.
+func (m *Manager) GetAllPids() ([]int, error) {
+	paths := m.GetPaths()
+	return cgroups.GetAllPids(paths["devices"])
+}
+
+// getCgroupData validates and normalizes the cgroup configuration c and
+// returns a cgroupData for pid. The cgroup may be identified either by
+// Path or by Name/Parent, but not both.
+func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
+	root, err := getCgroupRoot()
+	if err != nil {
+		return nil, err
+	}
+
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+	}
+
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
+	cgPath := libcontainerUtils.CleanPath(c.Path)
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
+	cgName := libcontainerUtils.CleanPath(c.Name)
+
+	innerPath := cgPath
+	if innerPath == "" {
+		innerPath = filepath.Join(cgParent, cgName)
+	}
+
+	return &cgroupData{
+		root:      root,
+		innerPath: innerPath,
+		config:    c,
+		pid:       pid,
+	}, nil
+}
+
+// path resolves the absolute cgroup path for subsystem: rooted at the
+// subsystem mount when the configured inner path is absolute, otherwise
+// relative to the calling process's own cgroup.
+func (raw *cgroupData) path(subsystem string) (string, error) {
+	mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
+	// If we didn't mount the subsystem, there is no point we make the path.
+	if err != nil {
+		return "", err
+	}
+
+	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
+	if filepath.IsAbs(raw.innerPath) {
+		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
+		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
+	}
+
+	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+	// process could in container and shared pid namespace with host, and
+	// /proc/1/cgroup could point to whole other world of cgroups.
+	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(parentPath, raw.innerPath), nil
+}
+
+// join creates the subsystem's cgroup directory (if necessary), moves
+// raw.pid into it, and returns the cgroup path.
+func (raw *cgroupData) join(subsystem string) (string, error) {
+	path, err := raw.path(subsystem)
+	if err != nil {
+		return "", err
+	}
+	if err := os.MkdirAll(path, 0755); err != nil {
+		return "", err
+	}
+	if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
+		return "", err
+	}
+	return path, nil
+}
+
+// removePath removes p when err is nil and p is non-empty; otherwise it
+// propagates err unchanged. Intended for chaining cleanup of several paths.
+func removePath(p string, err error) error {
+	if err != nil {
+		return err
+	}
+	if p != "" {
+		return os.RemoveAll(p)
+	}
+	return nil
+}
+
+// CheckCpushares reads back cpu.shares from path and returns an error if
+// the value differs from the requested c, i.e. the kernel adjusted an
+// out-of-range request. A zero c means "not configured" and is not checked.
+func CheckCpushares(path string, c uint64) error {
+	var cpuShares uint64
+
+	if c == 0 {
+		return nil
+	}
+
+	fd, err := os.Open(filepath.Join(path, "cpu.shares"))
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+
+	_, err = fmt.Fscanf(fd, "%d", &cpuShares)
+	if err != nil && err != io.EOF {
+		return err
+	}
+
+	if c > cpuShares {
+		return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
+	} else if c < cpuShares {
+		return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
+	}
+
+	return nil
+}
+
+// GetCgroups returns the cgroup configuration this manager was created with.
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+	return m.Cgroups, nil
+}
diff --git a/libcontainer/cgroups/fs/apply_raw_test.go b/libcontainer/cgroups/fs/apply_raw_test.go
new file mode 100644 (file)
index 0000000..f3b6556
--- /dev/null
@@ -0,0 +1,297 @@
+// +build linux
+
+package fs
+
+import (
+       "path/filepath"
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func TestInvalidCgroupPath(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Path: "../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+func TestInvalidAbsoluteCgroupPath(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Path: "/../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupParent(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "../../../../../../../../../../some/path",
+               Name:   "name",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupParent(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "/../../../../../../../../../../some/path",
+               Name:   "name",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupName(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "parent",
+               Name:   "../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupName(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "parent",
+               Name:   "/../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupNameAndParent(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "../../../../../../../../../../some/path",
+               Name:   "../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) {
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+       root, err := getCgroupRoot()
+       if err != nil {
+               t.Errorf("couldn't get cgroup root: %v", err)
+       }
+
+       config := &configs.Cgroup{
+               Parent: "/../../../../../../../../../../some/path",
+               Name:   "/../../../../../../../../../../some/path",
+       }
+
+       data, err := getCgroupData(config, 0)
+       if err != nil {
+               t.Errorf("couldn't get cgroup data: %v", err)
+       }
+
+       // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+       if strings.HasPrefix(data.innerPath, "..") {
+               t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+       }
+
+       // Double-check, using an actual cgroup.
+       deviceRoot := filepath.Join(root, "devices")
+       devicePath, err := data.path("devices")
+       if err != nil {
+               t.Errorf("couldn't get cgroup path: %v", err)
+       }
+       if !strings.HasPrefix(devicePath, deviceRoot) {
+               t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+       }
+}
diff --git a/libcontainer/cgroups/fs/blkio.go b/libcontainer/cgroups/fs/blkio.go
new file mode 100644 (file)
index 0000000..52c118d
--- /dev/null
@@ -0,0 +1,238 @@
+// +build linux
+
+package fs
+
+import (
+       "bufio"
+       "fmt"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// BlkioGroup handles the cgroup v1 "blkio" (block I/O) subsystem.
type BlkioGroup struct {
}

// Name returns the subsystem name handled by this group.
func (s *BlkioGroup) Name() string {
	const subsystem = "blkio"
	return subsystem
}
+
+func (s *BlkioGroup) Apply(d *cgroupData) error {
+       _, err := d.join("blkio")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.BlkioWeight != 0 {
+               if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+                       return err
+               }
+       }
+
+       if cgroup.Resources.BlkioLeafWeight != 0 {
+               if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
+                       return err
+               }
+       }
+       for _, wd := range cgroup.Resources.BlkioWeightDevice {
+               if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
+                       return err
+               }
+               if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
+               if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
+               if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
+               if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
+               if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+func (s *BlkioGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("blkio"))
+}
+
+/*
+examples:
+
+    blkio.sectors
+    8:0 6792
+
+    blkio.io_service_bytes
+    8:0 Read 1282048
+    8:0 Write 2195456
+    8:0 Sync 2195456
+    8:0 Async 1282048
+    8:0 Total 3477504
+    Total 3477504
+
+    blkio.io_serviced
+    8:0 Read 124
+    8:0 Write 104
+    8:0 Sync 104
+    8:0 Async 124
+    8:0 Total 228
+    Total 228
+
+    blkio.io_queued
+    8:0 Read 0
+    8:0 Write 0
+    8:0 Sync 0
+    8:0 Async 0
+    8:0 Total 0
+    Total 0
+*/
+
// splitBlkioStatLine reports whether r separates fields in a blkio
// stat line; fields are delimited by spaces and by the colon in the
// "major:minor" device prefix.
func splitBlkioStatLine(r rune) bool {
	switch r {
	case ' ', ':':
		return true
	}
	return false
}
+
+func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
+       var blkioStats []cgroups.BlkioStatEntry
+       f, err := os.Open(path)
+       if err != nil {
+               if os.IsNotExist(err) {
+                       return blkioStats, nil
+               }
+               return nil, err
+       }
+       defer f.Close()
+
+       sc := bufio.NewScanner(f)
+       for sc.Scan() {
+               // format: dev type amount
+               fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
+               if len(fields) < 3 {
+                       if len(fields) == 2 && fields[0] == "Total" {
+                               // skip total line
+                               continue
+                       } else {
+                               return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
+                       }
+               }
+
+               v, err := strconv.ParseUint(fields[0], 10, 64)
+               if err != nil {
+                       return nil, err
+               }
+               major := v
+
+               v, err = strconv.ParseUint(fields[1], 10, 64)
+               if err != nil {
+                       return nil, err
+               }
+               minor := v
+
+               op := ""
+               valueField := 2
+               if len(fields) == 4 {
+                       op = fields[2]
+                       valueField = 3
+               }
+               v, err = strconv.ParseUint(fields[valueField], 10, 64)
+               if err != nil {
+                       return nil, err
+               }
+               blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
+       }
+
+       return blkioStats, nil
+}
+
+func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
+       // Try to read CFQ stats available on all CFQ enabled kernels first
+       if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
+               return getCFQStats(path, stats)
+       }
+       return getStats(path, stats) // Use generic stats as fallback
+}
+
+func getCFQStats(path string, stats *cgroups.Stats) error {
+       var blkioStats []cgroups.BlkioStatEntry
+       var err error
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.SectorsRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoServicedRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoQueuedRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoServiceTimeRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoWaitTimeRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoMergedRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoTimeRecursive = blkioStats
+
+       return nil
+}
+
+func getStats(path string, stats *cgroups.Stats) error {
+       var blkioStats []cgroups.BlkioStatEntry
+       var err error
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+       if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
+               return err
+       }
+       stats.BlkioStats.IoServicedRecursive = blkioStats
+
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/blkio_test.go b/libcontainer/cgroups/fs/blkio_test.go
new file mode 100644 (file)
index 0000000..5ba60fa
--- /dev/null
@@ -0,0 +1,637 @@
+// +build linux
+
+package fs
+
+import (
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Fixture contents for the blkio stat files read by the tests below.
// Entries use device 8:0 throughout; the throttle fixtures add 252:0.
const (
	// Single "major:minor value" line with no per-operation breakdown.
	sectorsRecursiveContents      = `8:0 1024`
	// Per-operation counters followed by a per-device and an overall Total.
	serviceBytesRecursiveContents = `8:0 Read 100
8:0 Write 200
8:0 Sync 300
8:0 Async 500
8:0 Total 500
Total 500`
	servicedRecursiveContents = `8:0 Read 10
8:0 Write 40
8:0 Sync 20
8:0 Async 30
8:0 Total 50
Total 50`
	queuedRecursiveContents = `8:0 Read 1
8:0 Write 4
8:0 Sync 2
8:0 Async 3
8:0 Total 5
Total 5`
	serviceTimeRecursiveContents = `8:0 Read 173959
8:0 Write 0
8:0 Sync 0
8:0 Async 173959
8:0 Total 17395
Total 17395`
	// Note: has no trailing overall "Total" line.
	waitTimeRecursiveContents = `8:0 Read 15571
8:0 Write 0
8:0 Sync 0
8:0 Async 15571
8:0 Total 15571`
	mergedRecursiveContents = `8:0 Read 5
8:0 Write 10
8:0 Sync 0
8:0 Async 0
8:0 Total 15
Total 15`
	timeRecursiveContents = `8:0 8`
	// Throttle-policy counters covering two devices.
	throttleServiceBytes  = `8:0 Read 11030528
8:0 Write 23
8:0 Sync 42
8:0 Async 11030528
8:0 Total 11030528
252:0 Read 11030528
252:0 Write 23
252:0 Sync 42
252:0 Async 11030528
252:0 Total 11030528
Total 22061056`
	throttleServiced = `8:0 Read 164
8:0 Write 23
8:0 Sync 42
8:0 Async 164
8:0 Total 164
252:0 Read 164
252:0 Write 23
252:0 Sync 42
252:0 Async 164
252:0 Total 164
Total 328`
)
+
+func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) {
+       *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op})
+}
+
+func TestBlkioSetWeight(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+
+       const (
+               weightBefore = 100
+               weightAfter  = 200
+       )
+
+       helper.writeFileContents(map[string]string{
+               "blkio.weight": strconv.Itoa(weightBefore),
+       })
+
+       helper.CgroupData.config.Resources.BlkioWeight = weightAfter
+       blkio := &BlkioGroup{}
+       if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "blkio.weight")
+       if err != nil {
+               t.Fatalf("Failed to parse blkio.weight - %s", err)
+       }
+
+       if value != weightAfter {
+               t.Fatal("Got the wrong value, set blkio.weight failed.")
+       }
+}
+
+func TestBlkioSetWeightDevice(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+
+       const (
+               weightDeviceBefore = "8:0 400"
+       )
+
+       wd := configs.NewWeightDevice(8, 0, 500, 0)
+       weightDeviceAfter := wd.WeightString()
+
+       helper.writeFileContents(map[string]string{
+               "blkio.weight_device": weightDeviceBefore,
+       })
+
+       helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd}
+       blkio := &BlkioGroup{}
+       if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+       if err != nil {
+               t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+       }
+
+       if value != weightDeviceAfter {
+               t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+       }
+}
+
+// regression #274
+func TestBlkioSetMultipleWeightDevice(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+
+       const (
+               weightDeviceBefore = "8:0 400"
+       )
+
+       wd1 := configs.NewWeightDevice(8, 0, 500, 0)
+       wd2 := configs.NewWeightDevice(8, 16, 500, 0)
+       // we cannot actually set and check both because normal ioutil.WriteFile
+       // when writing to cgroup file will overwrite the whole file content instead
+       // of updating it as the kernel is doing. Just check the second device
+       // is present will suffice for the test to ensure multiple writes are done.
+       weightDeviceAfter := wd2.WeightString()
+
+       helper.writeFileContents(map[string]string{
+               "blkio.weight_device": weightDeviceBefore,
+       })
+
+       helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2}
+       blkio := &BlkioGroup{}
+       if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+       if err != nil {
+               t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+       }
+
+       if value != weightDeviceAfter {
+               t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+       }
+}
+
// TestBlkioStats populates every recursive blkio stat fixture file and
// checks that GetStats parses all of them into the expected entries.
func TestBlkioStats(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatal(err)
	}

	// Verify expected stats.
	// Build the expected entries to mirror the fixture contents above,
	// one appendBlkioStatEntry call per fixture line (overall "Total"
	// lines are skipped by the parser and so have no entry here).
	expectedStats := cgroups.BlkioStats{}
	appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "")

	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total")

	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total")

	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read")
	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write")
	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync")
	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async")
	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total")

	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read")
	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write")
	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async")
	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total")

	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read")
	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write")
	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync")
	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async")
	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total")

	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read")
	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write")
	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync")
	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async")
	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total")

	appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "")

	expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
}
+
+func TestBlkioStatsNoSectorsFile(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+               "blkio.io_serviced_recursive":      servicedRecursiveContents,
+               "blkio.io_queued_recursive":        queuedRecursiveContents,
+               "blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+               "blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+               "blkio.io_merged_recursive":        mergedRecursiveContents,
+               "blkio.time_recursive":             timeRecursiveContents,
+       })
+
+       blkio := &BlkioGroup{}
+       actualStats := *cgroups.NewStats()
+       err := blkio.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatalf("Failed unexpectedly: %s", err)
+       }
+}
+
+func TestBlkioStatsNoServiceBytesFile(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "blkio.io_serviced_recursive":     servicedRecursiveContents,
+               "blkio.io_queued_recursive":       queuedRecursiveContents,
+               "blkio.sectors_recursive":         sectorsRecursiveContents,
+               "blkio.io_service_time_recursive": serviceTimeRecursiveContents,
+               "blkio.io_wait_time_recursive":    waitTimeRecursiveContents,
+               "blkio.io_merged_recursive":       mergedRecursiveContents,
+               "blkio.time_recursive":            timeRecursiveContents,
+       })
+
+       blkio := &BlkioGroup{}
+       actualStats := *cgroups.NewStats()
+       err := blkio.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatalf("Failed unexpectedly: %s", err)
+       }
+}
+
+func TestBlkioStatsNoServicedFile(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+               "blkio.io_queued_recursive":        queuedRecursiveContents,
+               "blkio.sectors_recursive":          sectorsRecursiveContents,
+               "blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+               "blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+               "blkio.io_merged_recursive":        mergedRecursiveContents,
+               "blkio.time_recursive":             timeRecursiveContents,
+       })
+
+       blkio := &BlkioGroup{}
+       actualStats := *cgroups.NewStats()
+       err := blkio.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatalf("Failed unexpectedly: %s", err)
+       }
+}
+
+func TestBlkioStatsNoQueuedFile(t *testing.T) {
+       helper := NewCgroupTestUtil("blkio", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+               "blkio.io_serviced_recursive":      servicedRecursiveContents,
+               "blkio.sectors_recursive":          sectorsRecursiveContents,
+               "blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+               "blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+               "blkio.io_merged_recursive":        mergedRecursiveContents,
+               "blkio.time_recursive":             timeRecursiveContents,
+       })
+
+       blkio := &BlkioGroup{}
+       actualStats := *cgroups.NewStats()
+       err := blkio.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatalf("Failed unexpectedly: %s", err)
+       }
+}
+
// TestBlkioStatsNoServiceTimeFile verifies that GetStats succeeds when
// blkio.io_service_time_recursive is absent from the cgroup directory.
func TestBlkioStatsNoServiceTimeFile(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatalf("Failed unexpectedly: %s", err)
	}
}
+
// TestBlkioStatsNoWaitTimeFile verifies that GetStats succeeds when
// blkio.io_wait_time_recursive is absent from the cgroup directory.
func TestBlkioStatsNoWaitTimeFile(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatalf("Failed unexpectedly: %s", err)
	}
}
+
// TestBlkioStatsNoMergedFile verifies that GetStats succeeds when
// blkio.io_merged_recursive is absent from the cgroup directory.
func TestBlkioStatsNoMergedFile(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatalf("Failed unexpectedly: %s", err)
	}
}
+
// TestBlkioStatsNoTimeFile verifies that GetStats succeeds when
// blkio.time_recursive is absent from the cgroup directory.
func TestBlkioStatsNoTimeFile(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping test in short mode.")
	}
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatalf("Failed unexpectedly: %s", err)
	}
}
+
// TestBlkioStatsUnexpectedNumberOfFields verifies that GetStats reports an
// error when a stats line has four fields instead of the expected three
// ("major:minor op value").
func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": "8:0 Read 100 100", // malformed: extra field
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err == nil {
		t.Fatal("Expected to fail, but did not")
	}
}
+
// TestBlkioStatsUnexpectedFieldType verifies that GetStats reports an error
// when the value field of a stats line is not numeric.
func TestBlkioStatsUnexpectedFieldType(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": "8:0 Read Write", // malformed: non-numeric value
		"blkio.io_serviced_recursive":      servicedRecursiveContents,
		"blkio.io_queued_recursive":        queuedRecursiveContents,
		"blkio.sectors_recursive":          sectorsRecursiveContents,
		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
		"blkio.io_merged_recursive":        mergedRecursiveContents,
		"blkio.time_recursive":             timeRecursiveContents,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err == nil {
		t.Fatal("Expected to fail, but did not")
	}
}
+
// TestNonCFQBlkioStats verifies that when the CFQ-scheduler stat files are
// all empty (as happens with non-CFQ I/O schedulers), GetStats falls back to
// the blkio.throttle.* files and reports their contents.
func TestNonCFQBlkioStats(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()
	helper.writeFileContents(map[string]string{
		"blkio.io_service_bytes_recursive": "",
		"blkio.io_serviced_recursive":      "",
		"blkio.io_queued_recursive":        "",
		"blkio.sectors_recursive":          "",
		"blkio.io_service_time_recursive":  "",
		"blkio.io_wait_time_recursive":     "",
		"blkio.io_merged_recursive":        "",
		"blkio.time_recursive":             "",
		"blkio.throttle.io_service_bytes":  throttleServiceBytes,
		"blkio.throttle.io_serviced":       throttleServiced,
	})

	blkio := &BlkioGroup{}
	actualStats := *cgroups.NewStats()
	err := blkio.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatal(err)
	}

	// Verify expected stats.
	expectedStats := cgroups.BlkioStats{}

	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async")
	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total")

	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async")
	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total")

	expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
}
+
// TestBlkioSetThrottleReadBpsDevice verifies that Set writes the configured
// read-bps throttle device to blkio.throttle.read_bps_device.
func TestBlkioSetThrottleReadBpsDevice(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()

	const (
		throttleBefore = `8:0 1024`
	)

	td := configs.NewThrottleDevice(8, 0, 2048)
	throttleAfter := td.String()

	helper.writeFileContents(map[string]string{
		"blkio.throttle.read_bps_device": throttleBefore,
	})

	helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td}
	blkio := &BlkioGroup{}
	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device")
	if err != nil {
		t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err)
	}

	if value != throttleAfter {
		t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.")
	}
}
// TestBlkioSetThrottleWriteBpsDevice verifies that Set writes the configured
// write-bps throttle device to blkio.throttle.write_bps_device.
func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()

	const (
		throttleBefore = `8:0 1024`
	)

	td := configs.NewThrottleDevice(8, 0, 2048)
	throttleAfter := td.String()

	helper.writeFileContents(map[string]string{
		"blkio.throttle.write_bps_device": throttleBefore,
	})

	helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td}
	blkio := &BlkioGroup{}
	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device")
	if err != nil {
		t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err)
	}

	if value != throttleAfter {
		t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.")
	}
}
// TestBlkioSetThrottleReadIOpsDevice verifies that Set writes the configured
// read-iops throttle device to blkio.throttle.read_iops_device.
func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()

	const (
		throttleBefore = `8:0 1024`
	)

	td := configs.NewThrottleDevice(8, 0, 2048)
	throttleAfter := td.String()

	helper.writeFileContents(map[string]string{
		"blkio.throttle.read_iops_device": throttleBefore,
	})

	helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{td}
	blkio := &BlkioGroup{}
	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device")
	if err != nil {
		t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err)
	}

	if value != throttleAfter {
		t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.")
	}
}
// TestBlkioSetThrottleWriteIOpsDevice verifies that Set writes the configured
// write-iops throttle device to blkio.throttle.write_iops_device.
func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) {
	helper := NewCgroupTestUtil("blkio", t)
	defer helper.cleanup()

	const (
		throttleBefore = `8:0 1024`
	)

	td := configs.NewThrottleDevice(8, 0, 2048)
	throttleAfter := td.String()

	helper.writeFileContents(map[string]string{
		"blkio.throttle.write_iops_device": throttleBefore,
	})

	helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td}
	blkio := &BlkioGroup{}
	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device")
	if err != nil {
		t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err)
	}

	if value != throttleAfter {
		t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.")
	}
}
diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go
new file mode 100644 (file)
index 0000000..4db7b64
--- /dev/null
@@ -0,0 +1,118 @@
+// +build linux
+
+package fs
+
+import (
+       "bufio"
+       "os"
+       "path/filepath"
+       "strconv"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// CpuGroup implements the "cpu" cgroup v1 subsystem (CFS shares/bandwidth
// and real-time scheduling parameters).
type CpuGroup struct {
}
+
// Name returns the subsystem name, "cpu".
func (s *CpuGroup) Name() string {
	return "cpu"
}
+
// Apply joins the process to the cpu cgroup. A missing cpu hierarchy is
// tolerated: IsNotFound errors are ignored and ApplyDir treats an empty
// path as a no-op.
func (s *CpuGroup) Apply(d *cgroupData) error {
	// We always want to join the cpu group, to allow fair cpu scheduling
	// on a container basis
	path, err := d.path("cpu")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}
	return s.ApplyDir(path, d.config, d.pid)
}
+
// ApplyDir creates the cgroup directory at path, applies the real-time
// scheduling settings, and then moves pid into the cgroup. An empty path
// (no cpu cgroup mounted) is a no-op.
func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
	// This might happen if we have no cpu cgroup mounted.
	// Just do nothing and don't fail.
	if path == "" {
		return nil
	}
	if err := os.MkdirAll(path, 0755); err != nil {
		return err
	}
	// We should set the real-Time group scheduling settings before moving
	// in the process because if the process is already in SCHED_RR mode
	// and no RT bandwidth is set, adding it will fail.
	if err := s.SetRtSched(path, cgroup); err != nil {
		return err
	}
	// because we are not using d.join we need to place the pid into the procs file
	// unlike the other subsystems
	return cgroups.WriteCgroupProc(path, pid)
}
+
// SetRtSched writes the real-time scheduling parameters (cpu.rt_period_us
// and cpu.rt_runtime_us). A zero value means "not configured" and leaves
// the corresponding kernel file untouched.
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
	if cgroup.Resources.CpuRtPeriod != 0 {
		if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
			return err
		}
	}
	if cgroup.Resources.CpuRtRuntime != 0 {
		if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
			return err
		}
	}
	return nil
}
+
// Set applies the CFS settings (cpu.shares, cpu.cfs_period_us,
// cpu.cfs_quota_us) and then the real-time settings via SetRtSched.
// Zero values are treated as "not configured" and skipped; note this means
// a quota of 0 cannot be set explicitly through this path.
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
	if cgroup.Resources.CpuShares != 0 {
		if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
			return err
		}
	}
	if cgroup.Resources.CpuPeriod != 0 {
		if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
			return err
		}
	}
	if cgroup.Resources.CpuQuota != 0 {
		if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
			return err
		}
	}
	return s.SetRtSched(path, cgroup)
}
+
// Remove deletes the container's cpu cgroup directory.
func (s *CpuGroup) Remove(d *cgroupData) error {
	return removePath(d.path("cpu"))
}
+
+func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
+       f, err := os.Open(filepath.Join(path, "cpu.stat"))
+       if err != nil {
+               if os.IsNotExist(err) {
+                       return nil
+               }
+               return err
+       }
+       defer f.Close()
+
+       sc := bufio.NewScanner(f)
+       for sc.Scan() {
+               t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+               if err != nil {
+                       return err
+               }
+               switch t {
+               case "nr_periods":
+                       stats.CpuStats.ThrottlingData.Periods = v
+
+               case "nr_throttled":
+                       stats.CpuStats.ThrottlingData.ThrottledPeriods = v
+
+               case "throttled_time":
+                       stats.CpuStats.ThrottlingData.ThrottledTime = v
+               }
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/cpu_test.go b/libcontainer/cgroups/fs/cpu_test.go
new file mode 100644 (file)
index 0000000..2eeb489
--- /dev/null
@@ -0,0 +1,210 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
// TestCpuSetShares verifies that Set writes the configured CPU shares to
// cpu.shares.
func TestCpuSetShares(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()

	const (
		sharesBefore = 1024
		sharesAfter  = 512
	)

	helper.writeFileContents(map[string]string{
		"cpu.shares": strconv.Itoa(sharesBefore),
	})

	helper.CgroupData.config.Resources.CpuShares = sharesAfter
	cpu := &CpuGroup{}
	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares")
	if err != nil {
		t.Fatalf("Failed to parse cpu.shares - %s", err)
	}

	if value != sharesAfter {
		t.Fatal("Got the wrong value, set cpu.shares failed.")
	}
}
+
// TestCpuSetBandWidth verifies that Set updates all four bandwidth files:
// cpu.cfs_quota_us, cpu.cfs_period_us, cpu.rt_runtime_us and cpu.rt_period_us.
func TestCpuSetBandWidth(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()

	const (
		quotaBefore     = 8000
		quotaAfter      = 5000
		periodBefore    = 10000
		periodAfter     = 7000
		rtRuntimeBefore = 8000
		rtRuntimeAfter  = 5000
		rtPeriodBefore  = 10000
		rtPeriodAfter   = 7000
	)

	helper.writeFileContents(map[string]string{
		"cpu.cfs_quota_us":  strconv.Itoa(quotaBefore),
		"cpu.cfs_period_us": strconv.Itoa(periodBefore),
		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
	})

	helper.CgroupData.config.Resources.CpuQuota = quotaAfter
	helper.CgroupData.config.Resources.CpuPeriod = periodAfter
	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
	cpu := &CpuGroup{}
	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
	}
	if quota != quotaAfter {
		t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
	}

	period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
	}
	if period != periodAfter {
		t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
	}
	rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
	}
	if rtRuntime != rtRuntimeAfter {
		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
	}
	rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
	}
	if rtPeriod != rtPeriodAfter {
		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
	}
}
+
// TestCpuStats verifies that GetStats parses cpu.stat into ThrottlingData,
// including a throttled_time at the top of the uint64 range.
func TestCpuStats(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()

	const (
		nrPeriods     = 2000
		nrThrottled   = 200
		throttledTime = uint64(18446744073709551615) // max uint64
	)

	cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
		nrPeriods, nrThrottled, throttledTime)
	helper.writeFileContents(map[string]string{
		"cpu.stat": cpuStatContent,
	})

	cpu := &CpuGroup{}
	actualStats := *cgroups.NewStats()
	err := cpu.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatal(err)
	}

	expectedStats := cgroups.ThrottlingData{
		Periods:          nrPeriods,
		ThrottledPeriods: nrThrottled,
		ThrottledTime:    throttledTime}

	expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
}
+
// TestNoCpuStatFile verifies that GetStats does not fail when cpu.stat is
// missing (a missing file is treated as "no stats").
func TestNoCpuStatFile(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()

	cpu := &CpuGroup{}
	actualStats := *cgroups.NewStats()
	err := cpu.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatal("Expected not to fail, but did")
	}
}
+
// TestInvalidCpuStat verifies that GetStats reports an error when a cpu.stat
// value is not numeric.
func TestInvalidCpuStat(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()
	cpuStatContent := `nr_periods 2000
	nr_throttled 200
	throttled_time fortytwo`
	helper.writeFileContents(map[string]string{
		"cpu.stat": cpuStatContent,
	})

	cpu := &CpuGroup{}
	actualStats := *cgroups.NewStats()
	err := cpu.GetStats(helper.CgroupPath, &actualStats)
	if err == nil {
		t.Fatal("Expected failed stat parsing.")
	}
}
+
// TestCpuSetRtSchedAtApply verifies that ApplyDir writes the real-time
// scheduling settings (cpu.rt_runtime_us, cpu.rt_period_us) and then adds
// the pid to cgroup.procs.
func TestCpuSetRtSchedAtApply(t *testing.T) {
	helper := NewCgroupTestUtil("cpu", t)
	defer helper.cleanup()

	const (
		rtRuntimeBefore = 0
		rtRuntimeAfter  = 5000
		rtPeriodBefore  = 0
		rtPeriodAfter   = 7000
	)

	helper.writeFileContents(map[string]string{
		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
	})

	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
	cpu := &CpuGroup{}
	if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil {
		t.Fatal(err)
	}

	rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
	}
	if rtRuntime != rtRuntimeAfter {
		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
	}
	rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
	if err != nil {
		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
	}
	if rtPeriod != rtPeriodAfter {
		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
	}
	pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs")
	if err != nil {
		t.Fatalf("Failed to parse cgroup.procs - %s", err)
	}
	if pid != 1234 {
		t.Fatal("Got the wrong value, set cgroup.procs failed.")
	}
}
diff --git a/libcontainer/cgroups/fs/cpuacct.go b/libcontainer/cgroups/fs/cpuacct.go
new file mode 100644 (file)
index 0000000..95dc9a1
--- /dev/null
@@ -0,0 +1,122 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "io/ioutil"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/system"
+)
+
const (
	cgroupCpuacctStat   = "cpuacct.stat" // per-cgroup user/system tick counts
	nanosecondsInSecond = 1000000000     // used to convert ticks to nanoseconds
)

// clockTicks is the ticks-per-second value reported by system.GetClockTicks
// (presumably the kernel's USER_HZ — confirm against that helper).
var clockTicks = uint64(system.GetClockTicks())
+
// CpuacctGroup implements the "cpuacct" cgroup v1 subsystem (CPU accounting).
type CpuacctGroup struct {
}
+
// Name returns the subsystem name, "cpuacct".
func (s *CpuacctGroup) Name() string {
	return "cpuacct"
}
+
// Apply joins the cpuacct cgroup; there is nothing to configure, and a
// missing hierarchy (IsNotFound) is tolerated.
func (s *CpuacctGroup) Apply(d *cgroupData) error {
	// we just want to join this group even though we don't set anything
	if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) {
		return err
	}

	return nil
}
+
// Set is a no-op: the cpuacct subsystem has no configurable resources.
func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
	return nil
}
+
// Remove deletes the container's cpuacct cgroup directory.
func (s *CpuacctGroup) Remove(d *cgroupData) error {
	return removePath(d.path("cpuacct"))
}
+
// GetStats populates CPU usage figures from the cpuacct files: total usage
// (cpuacct.usage), per-CPU usage (cpuacct.usage_percpu), and the
// user/kernel breakdown (cpuacct.stat).
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
	userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
	if err != nil {
		return err
	}

	totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
	if err != nil {
		return err
	}

	percpuUsage, err := getPercpuUsage(path)
	if err != nil {
		return err
	}

	stats.CpuStats.CpuUsage.TotalUsage = totalUsage
	stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
	stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
	stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
	return nil
}
+
// getCpuUsageBreakdown reads cpuacct.stat and returns the user and kernel
// usage converted from clock ticks to nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
	userModeUsage := uint64(0)
	kernelModeUsage := uint64(0)
	const (
		userField   = "user"
		systemField = "system"
	)

	// Expected format:
	// user <usage in ticks>
	// system <usage in ticks>
	data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
	if err != nil {
		return 0, 0, err
	}
	fields := strings.Fields(string(data))
	if len(fields) < 4 {
		return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
	}
	if fields[0] != userField {
		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
	}
	if fields[2] != systemField {
		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
	}
	if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
		return 0, 0, err
	}
	if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
		return 0, 0, err
	}

	// NOTE(review): the multiplication below can overflow uint64 for
	// extremely large tick counts — presumed out of range in practice.
	return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
+
// getPercpuUsage reads cpuacct.usage_percpu and returns one usage value per
// CPU, in the order the kernel lists them.
func getPercpuUsage(path string) ([]uint64, error) {
	percpuUsage := []uint64{}
	data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
	if err != nil {
		return percpuUsage, err
	}
	for _, field := range strings.Fields(string(data)) {
		// Renamed from `value` to avoid shadowing the range variable,
		// and lowercased the error message per Go convention (ST1005).
		usage, err := strconv.ParseUint(field, 10, 64)
		if err != nil {
			return percpuUsage, fmt.Errorf("unable to convert param value to uint64: %v", err)
		}
		percpuUsage = append(percpuUsage, usage)
	}
	return percpuUsage, nil
}
diff --git a/libcontainer/cgroups/fs/cpuset.go b/libcontainer/cgroups/fs/cpuset.go
new file mode 100644 (file)
index 0000000..bfc900e
--- /dev/null
@@ -0,0 +1,160 @@
+// +build linux
+
+package fs
+
+import (
+       "bytes"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+)
+
// CpusetGroup implements the "cpuset" cgroup v1 subsystem (CPU and memory
// node placement).
type CpusetGroup struct {
}
+
// Name returns the subsystem name, "cpuset".
func (s *CpusetGroup) Name() string {
	return "cpuset"
}
+
// Apply joins the process to the cpuset cgroup via ApplyDir. IsNotFound
// errors are tolerated; ApplyDir treats an empty dir as a no-op.
func (s *CpusetGroup) Apply(d *cgroupData) error {
	dir, err := d.path("cpuset")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}
	return s.ApplyDir(dir, d.config, d.pid)
}
+
// Set writes the configured cpuset.cpus and cpuset.mems values. Empty
// strings mean "not configured" and leave the kernel files untouched.
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
	if cgroup.Resources.CpusetCpus != "" {
		if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
			return err
		}
	}
	if cgroup.Resources.CpusetMems != "" {
		if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
			return err
		}
	}
	return nil
}
+
// Remove deletes the container's cpuset cgroup directory.
func (s *CpusetGroup) Remove(d *cgroupData) error {
	return removePath(d.path("cpuset"))
}
+
// GetStats is a no-op: the cpuset subsystem exposes no statistics.
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
+
// ApplyDir prepares the cpuset cgroup at dir (creating and populating every
// ancestor up to the cpuset mount root first), applies the cpus/mems
// configuration, and finally moves pid into the cgroup. An empty dir (no
// cpuset cgroup mounted) is a no-op.
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
	// This might happen if we have no cpuset cgroup mounted.
	// Just do nothing and don't fail.
	if dir == "" {
		return nil
	}
	mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
	if err != nil {
		return err
	}
	root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
	// 'ensureParent' start with parent because we don't want to
	// explicitly inherit from parent, it could conflict with
	// 'cpuset.cpu_exclusive'.
	if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
		return err
	}
	if err := os.MkdirAll(dir, 0755); err != nil {
		return err
	}
	// We didn't inherit cpuset configs from parent, but we have
	// to ensure cpuset configs are set before moving task into the
	// cgroup.
	// The logic is, if user specified cpuset configs, use these
	// specified configs, otherwise, inherit from parent. This makes
	// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
	// keep backward compatibility.
	if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
		return err
	}

	// because we are not using d.join we need to place the pid into the procs file
	// unlike the other subsystems
	return cgroups.WriteCgroupProc(dir, pid)
}
+
// getSubsystemSettings returns the raw contents of parent's cpuset.cpus and
// cpuset.mems files.
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
	if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
		return
	}
	if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
		return
	}
	return cpus, mems, nil
}
+
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// its parent, recursing up to (but not including) root.
func (s *CpusetGroup) ensureParent(current, root string) error {
	parent := filepath.Dir(current)
	if libcontainerUtils.CleanPath(parent) == root {
		return nil
	}
	// Avoid infinite recursion.
	if parent == current {
		return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
	}
	if err := s.ensureParent(parent, root); err != nil {
		return err
	}
	if err := os.MkdirAll(current, 0755); err != nil {
		return err
	}
	return s.copyIfNeeded(current, parent)
}
+
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the current file's contents are
// empty (only newlines), i.e. the cgroup has not been configured yet.
func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
	var (
		err                      error
		currentCpus, currentMems []byte
		parentCpus, parentMems   []byte
	)

	if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
		return err
	}
	if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
		return err
	}

	if s.isEmpty(currentCpus) {
		if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
			return err
		}
	}
	if s.isEmpty(currentMems) {
		if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
			return err
		}
	}
	return nil
}
+
// isEmpty reports whether b contains nothing but newlines.
func (s *CpusetGroup) isEmpty(b []byte) bool {
	return len(bytes.Trim(b, "\n")) == 0
}
+
// ensureCpusAndMems applies any user-specified cpus/mems settings and then
// inherits whatever is still unset from the parent cgroup.
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
	if err := s.Set(path, cgroup); err != nil {
		return err
	}
	return s.copyIfNeeded(path, filepath.Dir(path))
}
diff --git a/libcontainer/cgroups/fs/cpuset_test.go b/libcontainer/cgroups/fs/cpuset_test.go
new file mode 100644 (file)
index 0000000..927e631
--- /dev/null
@@ -0,0 +1,67 @@
+// +build linux
+
+package fs
+
+import (
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// TestCpusetSetCpus verifies that Set rewrites cpuset.cpus with the
+// configured CPU range.
+func TestCpusetSetCpus(t *testing.T) {
+       helper := NewCgroupTestUtil("cpuset", t)
+       defer helper.cleanup()
+
+       const (
+               cpusBefore = "0"
+               cpusAfter  = "1-3"
+       )
+
+       helper.writeFileContents(map[string]string{
+               "cpuset.cpus": cpusBefore,
+       })
+
+       helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
+       cpuset := &CpusetGroup{}
+       if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus")
+       if err != nil {
+               t.Fatalf("Failed to parse cpuset.cpus - %s", err)
+       }
+
+       if value != cpusAfter {
+               t.Fatal("Got the wrong value, set cpuset.cpus failed.")
+       }
+}
+
+// TestCpusetSetMems verifies that Set rewrites cpuset.mems with the
+// configured memory node.
+func TestCpusetSetMems(t *testing.T) {
+       helper := NewCgroupTestUtil("cpuset", t)
+       defer helper.cleanup()
+
+       const (
+               memsBefore = "0"
+               memsAfter  = "1"
+       )
+
+       helper.writeFileContents(map[string]string{
+               "cpuset.mems": memsBefore,
+       })
+
+       helper.CgroupData.config.Resources.CpusetMems = memsAfter
+       cpuset := &CpusetGroup{}
+       if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems")
+       if err != nil {
+               t.Fatalf("Failed to parse cpuset.mems - %s", err)
+       }
+
+       if value != memsAfter {
+               t.Fatal("Got the wrong value, set cpuset.mems failed.")
+       }
+}
diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go
new file mode 100644 (file)
index 0000000..036c8db
--- /dev/null
@@ -0,0 +1,81 @@
+// +build linux
+
+package fs
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/system"
+)
+
+// DevicesGroup implements the devices cgroup (v1) subsystem.
+type DevicesGroup struct {
+}
+
+// Name returns the subsystem name as used in cgroup paths.
+func (s *DevicesGroup) Name() string {
+       return "devices"
+}
+
+// Apply joins the container's devices cgroup. Unlike most subsystems,
+// a missing devices cgroup is a hard error rather than being ignored.
+func (s *DevicesGroup) Apply(d *cgroupData) error {
+       _, err := d.join("devices")
+       if err != nil {
+               // We will return error even it's `not found` error, devices
+               // cgroup is hard requirement for container's security.
+               return err
+       }
+       return nil
+}
+
+func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
+       if system.RunningInUserNS() {
+               return nil
+       }
+
+       devices := cgroup.Resources.Devices
+       if len(devices) > 0 {
+               for _, dev := range devices {
+                       file := "devices.deny"
+                       if dev.Allow {
+                               file = "devices.allow"
+                       }
+                       if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
+                               return err
+                       }
+               }
+               return nil
+       }
+       if cgroup.Resources.AllowAllDevices != nil {
+               if *cgroup.Resources.AllowAllDevices == false {
+                       if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
+                               return err
+                       }
+
+                       for _, dev := range cgroup.Resources.AllowedDevices {
+                               if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
+                                       return err
+                               }
+                       }
+                       return nil
+               }
+
+               if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
+                       return err
+               }
+       }
+
+       for _, dev := range cgroup.Resources.DeniedDevices {
+               if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// Remove deletes the container's devices cgroup directory.
+func (s *DevicesGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("devices"))
+}
+
+// GetStats is a no-op: the devices cgroup exposes no statistics.
+func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/devices_test.go b/libcontainer/cgroups/fs/devices_test.go
new file mode 100644 (file)
index 0000000..648f4a2
--- /dev/null
@@ -0,0 +1,99 @@
+// +build linux
+
+package fs
+
+import (
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Test fixtures: one allowed and one denied character device, together
+// with the cgroup rule strings expected in devices.allow/devices.deny.
+var (
+       allowedDevices = []*configs.Device{
+               {
+                       Path:        "/dev/zero",
+                       Type:        'c',
+                       Major:       1,
+                       Minor:       5,
+                       Permissions: "rwm",
+                       FileMode:    0666,
+               },
+       }
+       allowedList   = "c 1:5 rwm"
+       deniedDevices = []*configs.Device{
+               {
+                       Path:        "/dev/null",
+                       Type:        'c',
+                       Major:       1,
+                       Minor:       3,
+                       Permissions: "rwm",
+                       FileMode:    0666,
+               },
+       }
+       deniedList = "c 1:3 rwm"
+)
+
+// TestDevicesSetAllow verifies that with AllowAllDevices=false the allowed
+// device list is written to devices.allow, and that AllowAllDevices=nil
+// leaves the existing policy untouched.
+func TestDevicesSetAllow(t *testing.T) {
+       helper := NewCgroupTestUtil("devices", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "devices.deny": "a",
+       })
+       allowAllDevices := false
+       helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
+       helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
+       devices := &DevicesGroup{}
+       if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
+       if err != nil {
+               t.Fatalf("Failed to parse devices.allow - %s", err)
+       }
+
+       if value != allowedList {
+               t.Fatal("Got the wrong value, set devices.allow failed.")
+       }
+
+       // When AllowAllDevices is nil, devices.allow file should not be modified.
+       helper.CgroupData.config.Resources.AllowAllDevices = nil
+       if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+       value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
+       if err != nil {
+               t.Fatalf("Failed to parse devices.allow - %s", err)
+       }
+       if value != allowedList {
+               t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
+       }
+}
+
+// TestDevicesSetDeny verifies that with AllowAllDevices=true the denied
+// device list is written to devices.deny.
+func TestDevicesSetDeny(t *testing.T) {
+       helper := NewCgroupTestUtil("devices", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "devices.allow": "a",
+       })
+
+       allowAllDevices := true
+       helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
+       helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
+       devices := &DevicesGroup{}
+       if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny")
+       if err != nil {
+               t.Fatalf("Failed to parse devices.deny - %s", err)
+       }
+
+       if value != deniedList {
+               t.Fatal("Got the wrong value, set devices.deny failed.")
+       }
+}
diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go
new file mode 100644 (file)
index 0000000..9dc81bd
--- /dev/null
@@ -0,0 +1,67 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "strings"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// FreezerGroup implements the freezer cgroup (v1) subsystem.
+type FreezerGroup struct {
+}
+
+// Name returns the subsystem name as used in cgroup paths.
+func (s *FreezerGroup) Name() string {
+       return "freezer"
+}
+
+// Apply joins the freezer cgroup; a missing freezer cgroup is ignored.
+func (s *FreezerGroup) Apply(d *cgroupData) error {
+       _, err := d.join("freezer")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
+       switch cgroup.Resources.Freezer {
+       case configs.Frozen, configs.Thawed:
+               for {
+                       // In case this loop does not exit because it doesn't get the expected
+                       // state, let's write again this state, hoping it's going to be properly
+                       // set this time. Otherwise, this loop could run infinitely, waiting for
+                       // a state change that would never happen.
+                       if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
+                               return err
+                       }
+
+                       state, err := fscommon.ReadFile(path, "freezer.state")
+                       if err != nil {
+                               return err
+                       }
+                       if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
+                               break
+                       }
+
+                       time.Sleep(1 * time.Millisecond)
+               }
+       case configs.Undefined:
+               return nil
+       default:
+               return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
+       }
+
+       return nil
+}
+
+// Remove deletes the container's freezer cgroup directory.
+func (s *FreezerGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("freezer"))
+}
+
+// GetStats is a no-op: the freezer cgroup exposes no statistics.
+func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/freezer_test.go b/libcontainer/cgroups/fs/freezer_test.go
new file mode 100644 (file)
index 0000000..ad80261
--- /dev/null
@@ -0,0 +1,48 @@
+// +build linux
+
+package fs
+
+import (
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// TestFreezerSetState verifies that Set transitions freezer.state from
+// FROZEN to THAWED.
+func TestFreezerSetState(t *testing.T) {
+       helper := NewCgroupTestUtil("freezer", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "freezer.state": string(configs.Frozen),
+       })
+
+       helper.CgroupData.config.Resources.Freezer = configs.Thawed
+       freezer := &FreezerGroup{}
+       if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state")
+       if err != nil {
+               t.Fatalf("Failed to parse freezer.state - %s", err)
+       }
+       if value != string(configs.Thawed) {
+               t.Fatal("Got the wrong value, set freezer.state failed.")
+       }
+}
+
+// TestFreezerSetInvalidState verifies that Set rejects an unknown
+// freezer state with an error.
+func TestFreezerSetInvalidState(t *testing.T) {
+       helper := NewCgroupTestUtil("freezer", t)
+       defer helper.cleanup()
+
+       const (
+               invalidArg configs.FreezerState = "Invalid"
+       )
+
+       helper.CgroupData.config.Resources.Freezer = invalidArg
+       freezer := &FreezerGroup{}
+       if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil {
+               t.Fatal("Failed to return invalid argument error")
+       }
+}
diff --git a/libcontainer/cgroups/fs/fs_unsupported.go b/libcontainer/cgroups/fs/fs_unsupported.go
new file mode 100644 (file)
index 0000000..3ef9e03
--- /dev/null
@@ -0,0 +1,3 @@
+// +build !linux
+
+package fs
diff --git a/libcontainer/cgroups/fs/hugetlb.go b/libcontainer/cgroups/fs/hugetlb.go
new file mode 100644 (file)
index 0000000..68719c2
--- /dev/null
@@ -0,0 +1,72 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// HugetlbGroup implements the hugetlb cgroup (v1) subsystem.
+type HugetlbGroup struct {
+}
+
+// Name returns the subsystem name as used in cgroup paths.
+func (s *HugetlbGroup) Name() string {
+       return "hugetlb"
+}
+
+// Apply joins the hugetlb cgroup; a missing hugetlb cgroup is ignored.
+func (s *HugetlbGroup) Apply(d *cgroupData) error {
+       _, err := d.join("hugetlb")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
+       for _, hugetlb := range cgroup.Resources.HugetlbLimit {
+               if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// Remove deletes the container's hugetlb cgroup directory.
+func (s *HugetlbGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("hugetlb"))
+}
+
+// GetStats fills stats.HugetlbStats with the usage, max usage and
+// failcnt counters for every supported huge page size. Any unreadable
+// counter file is a hard error.
+func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
+       hugetlbStats := cgroups.HugetlbStats{}
+       for _, pageSize := range HugePageSizes {
+               usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
+               value, err := fscommon.GetCgroupParamUint(path, usage)
+               if err != nil {
+                       return fmt.Errorf("failed to parse %s - %v", usage, err)
+               }
+               hugetlbStats.Usage = value
+
+               maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
+               value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+               if err != nil {
+                       return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
+               }
+               hugetlbStats.MaxUsage = value
+
+               failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
+               value, err = fscommon.GetCgroupParamUint(path, failcnt)
+               if err != nil {
+                       return fmt.Errorf("failed to parse %s - %v", failcnt, err)
+               }
+               hugetlbStats.Failcnt = value
+
+               // hugetlbStats is a value type, so each map entry gets its
+               // own copy of this iteration's counters.
+               stats.HugetlbStats[pageSize] = hugetlbStats
+       }
+
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
new file mode 100644 (file)
index 0000000..9ddacfe
--- /dev/null
@@ -0,0 +1,155 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Canned counter file contents used by the stats tests below.
+const (
+       hugetlbUsageContents    = "128\n"
+       hugetlbMaxUsageContents = "256\n"
+       hugetlbFailcnt          = "100\n"
+)
+
+// printf templates for the per-page-size hugetlb control file names.
+var (
+       usage    = "hugetlb.%s.usage_in_bytes"
+       limit    = "hugetlb.%s.limit_in_bytes"
+       maxUsage = "hugetlb.%s.max_usage_in_bytes"
+       failcnt  = "hugetlb.%s.failcnt"
+)
+
+// TestHugetlbSetHugetlb verifies that Set updates the limit_in_bytes
+// file for every supported huge page size.
+func TestHugetlbSetHugetlb(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+
+       const (
+               hugetlbBefore = 256
+               hugetlbAfter  = 512
+       )
+
+       for _, pageSize := range HugePageSizes {
+               helper.writeFileContents(map[string]string{
+                       fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore),
+               })
+       }
+
+       for _, pageSize := range HugePageSizes {
+               helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{
+                       {
+                               Pagesize: pageSize,
+                               Limit:    hugetlbAfter,
+                       },
+               }
+               hugetlb := &HugetlbGroup{}
+               if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+                       t.Fatal(err)
+               }
+       }
+
+       for _, pageSize := range HugePageSizes {
+               // Shadows the package-level template with the expanded name.
+               limit := fmt.Sprintf(limit, pageSize)
+               value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit)
+               if err != nil {
+                       t.Fatalf("Failed to parse %s - %s", limit, err)
+               }
+               if value != hugetlbAfter {
+                       t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
+               }
+       }
+}
+
+// TestHugetlbStats verifies that GetStats parses usage, max usage and
+// failcnt for every page size.
+func TestHugetlbStats(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+       for _, pageSize := range HugePageSizes {
+               helper.writeFileContents(map[string]string{
+                       fmt.Sprintf(usage, pageSize):    hugetlbUsageContents,
+                       fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
+                       fmt.Sprintf(failcnt, pageSize):  hugetlbFailcnt,
+               })
+       }
+
+       hugetlb := &HugetlbGroup{}
+       actualStats := *cgroups.NewStats()
+       err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatal(err)
+       }
+       expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
+       for _, pageSize := range HugePageSizes {
+               expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
+       }
+}
+
+// TestHugetlbStatsNoUsageFile verifies that GetStats fails when the
+// usage_in_bytes file is missing.
+func TestHugetlbStatsNoUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               maxUsage: hugetlbMaxUsageContents,
+       })
+
+       hugetlb := &HugetlbGroup{}
+       actualStats := *cgroups.NewStats()
+       err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestHugetlbStatsNoMaxUsageFile verifies that GetStats fails when the
+// max_usage_in_bytes file is missing.
+func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+       for _, pageSize := range HugePageSizes {
+               helper.writeFileContents(map[string]string{
+                       fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
+               })
+       }
+
+       hugetlb := &HugetlbGroup{}
+       actualStats := *cgroups.NewStats()
+       err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestHugetlbStatsBadUsageFile verifies that GetStats fails when
+// usage_in_bytes holds a non-numeric value.
+func TestHugetlbStatsBadUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+       for _, pageSize := range HugePageSizes {
+               helper.writeFileContents(map[string]string{
+                       fmt.Sprintf(usage, pageSize): "bad",
+                       maxUsage:                     hugetlbMaxUsageContents,
+               })
+       }
+
+       hugetlb := &HugetlbGroup{}
+       actualStats := *cgroups.NewStats()
+       err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestHugetlbStatsBadMaxUsageFile verifies that GetStats fails when the
+// expected usage counter cannot be read as a number.
+func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("hugetlb", t)
+       defer helper.cleanup()
+       // NOTE(review): the templates are written here unformatted (literal
+       // "%s" in the filename), so GetStats fails on the missing per-size
+       // usage file — the test still exercises the error path. Verify this
+       // is intentional.
+       helper.writeFileContents(map[string]string{
+               usage:    hugetlbUsageContents,
+               maxUsage: "bad",
+       })
+
+       hugetlb := &HugetlbGroup{}
+       actualStats := *cgroups.NewStats()
+       err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
diff --git a/libcontainer/cgroups/fs/kmem.go b/libcontainer/cgroups/fs/kmem.go
new file mode 100644 (file)
index 0000000..69b5a19
--- /dev/null
@@ -0,0 +1,62 @@
+// +build linux,!nokmem
+
+package fs
+
+import (
+       "errors"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "strconv"
+       "syscall" // for Errno type only
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "golang.org/x/sys/unix"
+)
+
+const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
+
+// EnableKernelMemoryAccounting turns on kmem accounting for the cgroup at
+// path by setting a temporary kmem limit and then removing it. Missing
+// kernel support is silently ignored.
+func EnableKernelMemoryAccounting(path string) error {
+       // Ensure that kernel memory is available in this kernel build. If it
+       // isn't, we just ignore it because EnableKernelMemoryAccounting is
+       // automatically called for all memory limits.
+       if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
+               return nil
+       }
+       // We have to limit the kernel memory here as it won't be accounted at all
+       // until a limit is set on the cgroup and limit cannot be set once the
+       // cgroup has children, or if there are already tasks in the cgroup.
+       for _, i := range []int64{1, -1} {
+               if err := setKernelMemory(path, i); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+// setKernelMemory writes kernelMemoryLimit (in bytes, -1 for unlimited)
+// to memory.kmem.limit_in_bytes. Unlike EnableKernelMemoryAccounting, a
+// kernel without kmem support is a hard error here, since the caller has
+// explicitly requested a kmem limit.
+func setKernelMemory(path string, kernelMemoryLimit int64) error {
+       if path == "" {
+               return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
+       }
+       if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
+               // We have specifically been asked to set a kmem limit. If the kernel
+               // doesn't support it we *must* error out.
+               return errors.New("kernel memory accounting not supported by this kernel")
+       }
+       if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
+               // Check if the error number returned by the syscall is "EBUSY"
+               // The EBUSY signal is returned on attempts to write to the
+               // memory.kmem.limit_in_bytes file if the cgroup has children or
+               // once tasks have been attached to the cgroup
+               if pathErr, ok := err.(*os.PathError); ok {
+                       if errNo, ok := pathErr.Err.(syscall.Errno); ok {
+                               if errNo == unix.EBUSY {
+                                       return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
+                               }
+                       }
+               }
+               return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/kmem_disabled.go b/libcontainer/cgroups/fs/kmem_disabled.go
new file mode 100644 (file)
index 0000000..ac290fd
--- /dev/null
@@ -0,0 +1,15 @@
+// +build linux,nokmem
+
+package fs
+
+import (
+       "errors"
+)
+
+// EnableKernelMemoryAccounting is a no-op in builds with the nokmem tag.
+func EnableKernelMemoryAccounting(path string) error {
+       return nil
+}
+
+// setKernelMemory always fails: kmem support is compiled out (nokmem).
+func setKernelMemory(path string, kernelMemoryLimit int64) error {
+       return errors.New("kernel memory accounting disabled in this runc build")
+}
diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go
new file mode 100644 (file)
index 0000000..f81ed05
--- /dev/null
@@ -0,0 +1,271 @@
+// +build linux
+
+package fs
+
+import (
+       "bufio"
+       "fmt"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+const (
+       cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
+       cgroupMemoryLimit     = "memory.limit_in_bytes"
+)
+
+// MemoryGroup implements the memory cgroup (v1) subsystem.
+type MemoryGroup struct {
+}
+
+// Name returns the subsystem name as used in cgroup paths.
+func (s *MemoryGroup) Name() string {
+       return "memory"
+}
+
+// Apply creates the memory cgroup (enabling kmem accounting for cgroups
+// libcontainer itself creates) and joins it. The cgroup directory is
+// removed again if anything fails.
+func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
+       path, err := d.path("memory")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       } else if path == "" {
+               return nil
+       }
+       if memoryAssigned(d.config) {
+               if _, err := os.Stat(path); os.IsNotExist(err) {
+                       if err := os.MkdirAll(path, 0755); err != nil {
+                               return err
+                       }
+                       // Only enable kernel memory accounting when this cgroup
+                       // is created by libcontainer; otherwise we might get an
+                       // error when people use `cgroupsPath` to join an existing
+                       // cgroup whose kernel memory is not initialized.
+                       if err := EnableKernelMemoryAccounting(path); err != nil {
+                               return err
+                       }
+               }
+       }
+       // Clean up the directory on any failure below (named return err).
+       defer func() {
+               if err != nil {
+                       os.RemoveAll(path)
+               }
+       }()
+
+       // We need to join memory cgroup after set memory limits, because
+       // kmem.limit_in_bytes can only be set when the cgroup is empty.
+       _, err = d.join("memory")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+// setMemoryAndSwap writes memory.limit_in_bytes and
+// memory.memsw.limit_in_bytes, ordering the two writes so that the
+// kernel's invariant (swap limit >= memory limit) is never violated
+// while updating a running container.
+func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
+       // If the memory update is set to -1 we should also
+       // set swap to -1, it means unlimited memory.
+       if cgroup.Resources.Memory == -1 {
+               // Only set swap if it's enabled in kernel
+               if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
+                       cgroup.Resources.MemorySwap = -1
+               }
+       }
+
+       // When memory and swap memory are both set, we need to handle the cases
+       // for updating container.
+       if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
+               memoryUsage, err := getMemoryData(path, "")
+               if err != nil {
+                       return err
+               }
+
+               // When update memory limit, we should adapt the write sequence
+               // for memory and swap memory, so it won't fail because the new
+               // value and the old value don't fit kernel's validation.
+               if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
+                       // Swap is growing (or unlimited): raise swap first.
+                       if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+                               return err
+                       }
+                       if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+                               return err
+                       }
+               } else {
+                       // Swap is shrinking: lower memory first.
+                       if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+                               return err
+                       }
+                       if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+                               return err
+                       }
+               }
+       } else {
+               // Only one of the two limits is set; write it directly.
+               if cgroup.Resources.Memory != 0 {
+                       if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+                               return err
+                       }
+               }
+               if cgroup.Resources.MemorySwap != 0 {
+                       if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+                               return err
+                       }
+               }
+       }
+
+       return nil
+}
+
+// Set applies all memory-related limits from cgroup.Resources: main and
+// swap limits, kernel memory, soft limit (reservation), kmem TCP limit,
+// OOM-killer control and swappiness. A swappiness outside 0-100 (and not
+// the "leave unchanged" -1/nil) is rejected.
+func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
+       if err := setMemoryAndSwap(path, cgroup); err != nil {
+               return err
+       }
+
+       if cgroup.Resources.KernelMemory != 0 {
+               if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
+                       return err
+               }
+       }
+
+       if cgroup.Resources.MemoryReservation != 0 {
+               if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+                       return err
+               }
+       }
+
+       if cgroup.Resources.KernelMemoryTCP != 0 {
+               if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
+                       return err
+               }
+       }
+       if cgroup.Resources.OomKillDisable {
+               if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
+                       return err
+               }
+       }
+       if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
+               // nil/-1 means "do not touch memory.swappiness".
+               return nil
+       } else if *cgroup.Resources.MemorySwappiness <= 100 {
+               if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
+                       return err
+               }
+       } else {
+               return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
+       }
+
+       return nil
+}
+
+// Remove deletes the container's memory cgroup directory.
+func (s *MemoryGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("memory"))
+}
+
+// GetStats populates stats.MemoryStats from memory.stat plus the usage
+// counters of the memory, memsw, kmem and kmem.tcp sub-modules. A missing
+// memory.stat (cgroup gone) is treated as "no stats" rather than an error.
+func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
+       // Set stats from memory.stat.
+       statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
+       if err != nil {
+               if os.IsNotExist(err) {
+                       return nil
+               }
+               return err
+       }
+       defer statsFile.Close()
+
+       // memory.stat is "key value" per line.
+       sc := bufio.NewScanner(statsFile)
+       for sc.Scan() {
+               t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+               if err != nil {
+                       return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
+               }
+               stats.MemoryStats.Stats[t] = v
+       }
+       stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
+
+       memoryUsage, err := getMemoryData(path, "")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.Usage = memoryUsage
+       swapUsage, err := getMemoryData(path, "memsw")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.SwapUsage = swapUsage
+       kernelUsage, err := getMemoryData(path, "kmem")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.KernelUsage = kernelUsage
+       kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
+
+       useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
+       value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
+       if err != nil {
+               return err
+       }
+       if value == 1 {
+               stats.MemoryStats.UseHierarchy = true
+       }
+       return nil
+}
+
+// memoryAssigned reports whether the config sets any memory-related limit
+// or knob, i.e. whether the memory cgroup needs configuring at all.
+func memoryAssigned(cgroup *configs.Cgroup) bool {
+       return cgroup.Resources.Memory != 0 ||
+               cgroup.Resources.MemoryReservation != 0 ||
+               cgroup.Resources.MemorySwap > 0 ||
+               cgroup.Resources.KernelMemory > 0 ||
+               cgroup.Resources.KernelMemoryTCP > 0 ||
+               cgroup.Resources.OomKillDisable ||
+               (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
+}
+
+// getMemoryData reads the usage_in_bytes, max_usage_in_bytes, failcnt and
+// limit_in_bytes counters of one memory sub-module. name selects the
+// module: "" for plain "memory", or e.g. "memsw", "kmem", "kmem.tcp".
+// For the non-root modules a missing counter file means the kernel
+// feature is absent, and a zero MemoryData is returned without error.
+func getMemoryData(path, name string) (cgroups.MemoryData, error) {
+       moduleName := "memory"
+       if name != "" {
+               moduleName = strings.Join([]string{"memory", name}, ".")
+       }
+
+       var memoryData cgroups.MemoryData
+       // Table-driven read of the four counters; previously this was four
+       // copy-pasted read-and-check stanzas.
+       for _, c := range []struct {
+               suffix string
+               dst    *uint64
+       }{
+               {"usage_in_bytes", &memoryData.Usage},
+               {"max_usage_in_bytes", &memoryData.MaxUsage},
+               {"failcnt", &memoryData.Failcnt},
+               {"limit_in_bytes", &memoryData.Limit},
+       } {
+               param := strings.Join([]string{moduleName, c.suffix}, ".")
+               value, err := fscommon.GetCgroupParamUint(path, param)
+               if err != nil {
+                       if moduleName != "memory" && os.IsNotExist(err) {
+                               // Optional sub-module not present in this kernel.
+                               return cgroups.MemoryData{}, nil
+                       }
+                       return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", param, err)
+               }
+               *c.dst = value
+       }
+
+       return memoryData, nil
+}
diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go
new file mode 100644 (file)
index 0000000..62de563
--- /dev/null
@@ -0,0 +1,456 @@
+// +build linux
+
+package fs
+
+import (
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// Canned cgroup-v1 memory controller file contents shared by the tests in
+// this file. Values are written verbatim into the fake cgroup directory.
+const (
+       memoryStatContents = `cache 512
+rss 1024`
+       memoryUsageContents        = "2048\n"
+       memoryMaxUsageContents     = "4096\n"
+       memoryFailcnt              = "100\n"
+       memoryLimitContents        = "8192\n"
+       memoryUseHierarchyContents = "1\n"
+)
+
+// TestMemorySetMemory verifies that MemoryGroup.Set writes
+// Resources.Memory to memory.limit_in_bytes and Resources.MemoryReservation
+// to memory.soft_limit_in_bytes.
+func TestMemorySetMemory(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               memoryBefore      = 314572800 // 300M
+               memoryAfter       = 524288000 // 500M
+               reservationBefore = 209715200 // 200M
+               reservationAfter  = 314572800 // 300M
+       )
+
+       // Seed the fake cgroup files with the pre-Set values.
+       helper.writeFileContents(map[string]string{
+               "memory.limit_in_bytes":      strconv.Itoa(memoryBefore),
+               "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
+       })
+
+       helper.CgroupData.config.Resources.Memory = memoryAfter
+       helper.CgroupData.config.Resources.MemoryReservation = reservationAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+       }
+       if value != memoryAfter {
+               t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+       }
+
+       value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
+       }
+       if value != reservationAfter {
+               t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetMemoryswap verifies that MemoryGroup.Set writes
+// Resources.MemorySwap to memory.memsw.limit_in_bytes.
+func TestMemorySetMemoryswap(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               memoryswapBefore = 314572800 // 300M
+               memoryswapAfter  = 524288000 // 500M
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+       })
+
+       helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+       }
+       if value != memoryswapAfter {
+               t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetMemoryLargerThanSwap raises both the memory and swap limits
+// and checks that Set stores both new values. Usage/failcnt files are faked
+// because Set reads current memory data when both limits change.
+func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               memoryBefore     = 314572800 // 300M
+               memoryswapBefore = 524288000 // 500M
+               memoryAfter      = 629145600 // 600M
+               memoryswapAfter  = 838860800 // 800M
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
+               "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+               // Set will call getMemoryData when memory and swap memory are
+               // both set, fake these fields so we don't get error.
+               "memory.usage_in_bytes":     "0",
+               "memory.max_usage_in_bytes": "0",
+               "memory.failcnt":            "0",
+       })
+
+       helper.CgroupData.config.Resources.Memory = memoryAfter
+       helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+       }
+       if value != memoryAfter {
+               t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+       }
+       value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+       }
+       if value != memoryswapAfter {
+               t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetSwapSmallerThanMemory lowers both limits (shrink path) and
+// checks both new values land in the corresponding cgroup files.
+func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               memoryBefore     = 629145600 // 600M
+               memoryswapBefore = 838860800 // 800M
+               memoryAfter      = 314572800 // 300M
+               memoryswapAfter  = 524288000 // 500M
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
+               "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+               // Set will call getMemoryData when memory and swap memory are
+               // both set, fake these fields so we don't get error.
+               "memory.usage_in_bytes":     "0",
+               "memory.max_usage_in_bytes": "0",
+               "memory.failcnt":            "0",
+       })
+
+       helper.CgroupData.config.Resources.Memory = memoryAfter
+       helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+       }
+       if value != memoryAfter {
+               t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+       }
+       value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+       }
+       if value != memoryswapAfter {
+               t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetKernelMemory verifies that Set writes
+// Resources.KernelMemory to memory.kmem.limit_in_bytes.
+func TestMemorySetKernelMemory(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               kernelMemoryBefore = 314572800 // 300M
+               kernelMemoryAfter  = 524288000 // 500M
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
+       })
+
+       helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
+       }
+       if value != kernelMemoryAfter {
+               t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetKernelMemoryTCP verifies that Set writes
+// Resources.KernelMemoryTCP to memory.kmem.tcp.limit_in_bytes.
+func TestMemorySetKernelMemoryTCP(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               kernelMemoryTCPBefore = 314572800 // 300M
+               kernelMemoryTCPAfter  = 524288000 // 500M
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore),
+       })
+
+       helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err)
+       }
+       if value != kernelMemoryTCPAfter {
+               t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.")
+       }
+}
+
+// TestMemorySetMemorySwappinessDefault verifies that Set writes
+// *Resources.MemorySwappiness (here 0) to memory.swappiness,
+// overwriting the kernel default of 60.
+func TestMemorySetMemorySwappinessDefault(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       swappinessBefore := 60 // kernel default is 60
+       swappinessAfter := uint64(0)
+
+       helper.writeFileContents(map[string]string{
+               "memory.swappiness": strconv.Itoa(swappinessBefore),
+       })
+
+       helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.swappiness - %s", err)
+       }
+       if value != swappinessAfter {
+               t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter)
+       }
+}
+
+// TestMemoryStats populates the full set of memory controller files
+// (main, memsw and kmem variants plus memory.stat and use_hierarchy) and
+// checks that GetStats parses them into the expected MemoryStats.
+func TestMemoryStats(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":                     memoryStatContents,
+               "memory.usage_in_bytes":           memoryUsageContents,
+               "memory.limit_in_bytes":           memoryLimitContents,
+               "memory.max_usage_in_bytes":       memoryMaxUsageContents,
+               "memory.failcnt":                  memoryFailcnt,
+               "memory.memsw.usage_in_bytes":     memoryUsageContents,
+               "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.memsw.failcnt":            memoryFailcnt,
+               "memory.memsw.limit_in_bytes":     memoryLimitContents,
+               "memory.kmem.usage_in_bytes":      memoryUsageContents,
+               "memory.kmem.max_usage_in_bytes":  memoryMaxUsageContents,
+               "memory.kmem.failcnt":             memoryFailcnt,
+               "memory.kmem.limit_in_bytes":      memoryLimitContents,
+               "memory.use_hierarchy":            memoryUseHierarchyContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatal(err)
+       }
+       // Expected values mirror the fixture constants at the top of the file.
+       expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true}
+       expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats)
+}
+
+// TestMemoryStatsNoStatFile checks that GetStats tolerates a missing
+// memory.stat file (no error expected).
+func TestMemoryStatsNoStatFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.usage_in_bytes":     memoryUsageContents,
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.limit_in_bytes":     memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err != nil {
+               t.Fatal(err)
+       }
+}
+
+// TestMemoryStatsNoUsageFile checks that GetStats fails when
+// memory.usage_in_bytes is absent.
+func TestMemoryStatsNoUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               memoryStatContents,
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.limit_in_bytes":     memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsNoMaxUsageFile checks that GetStats fails when
+// memory.max_usage_in_bytes is absent.
+func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":           memoryStatContents,
+               "memory.usage_in_bytes": memoryUsageContents,
+               "memory.limit_in_bytes": memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsNoLimitInBytesFile checks that GetStats fails when
+// memory.limit_in_bytes is absent.
+func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               memoryStatContents,
+               "memory.usage_in_bytes":     memoryUsageContents,
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsBadStatFile checks that GetStats fails on a memory.stat
+// whose value field is not a number.
+func TestMemoryStatsBadStatFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               "rss rss",
+               "memory.usage_in_bytes":     memoryUsageContents,
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.limit_in_bytes":     memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsBadUsageFile checks that GetStats fails on a
+// non-numeric memory.usage_in_bytes.
+func TestMemoryStatsBadUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               memoryStatContents,
+               "memory.usage_in_bytes":     "bad",
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.limit_in_bytes":     memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsBadMaxUsageFile checks that GetStats fails on a
+// non-numeric memory.max_usage_in_bytes.
+func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               memoryStatContents,
+               "memory.usage_in_bytes":     memoryUsageContents,
+               "memory.max_usage_in_bytes": "bad",
+               "memory.limit_in_bytes":     memoryLimitContents,
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemoryStatsBadLimitInBytesFile checks that GetStats fails on a
+// non-numeric memory.limit_in_bytes.
+func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+       helper.writeFileContents(map[string]string{
+               "memory.stat":               memoryStatContents,
+               "memory.usage_in_bytes":     memoryUsageContents,
+               "memory.max_usage_in_bytes": memoryMaxUsageContents,
+               "memory.limit_in_bytes":     "bad",
+       })
+
+       memory := &MemoryGroup{}
+       actualStats := *cgroups.NewStats()
+       err := memory.GetStats(helper.CgroupPath, &actualStats)
+       if err == nil {
+               t.Fatal("Expected failure")
+       }
+}
+
+// TestMemorySetOomControl verifies that Set leaves a pre-existing
+// memory.oom_control value intact (the config carries no OOM setting here,
+// so the seeded value must still be readable afterwards).
+func TestMemorySetOomControl(t *testing.T) {
+       helper := NewCgroupTestUtil("memory", t)
+       defer helper.cleanup()
+
+       const (
+               oomKillDisable = 1 // disable oom killer, default is 0
+       )
+
+       helper.writeFileContents(map[string]string{
+               "memory.oom_control": strconv.Itoa(oomKillDisable),
+       })
+
+       memory := &MemoryGroup{}
+       if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control")
+       if err != nil {
+               t.Fatalf("Failed to parse memory.oom_control - %s", err)
+       }
+
+       if value != oomKillDisable {
+               // Use Fatal (not Fatalf): the message has no format verbs, matching
+               // the sibling tests in this file.
+               t.Fatal("Got the wrong value, set memory.oom_control failed.")
+       }
+}
diff --git a/libcontainer/cgroups/fs/name.go b/libcontainer/cgroups/fs/name.go
new file mode 100644 (file)
index 0000000..d8cf1d8
--- /dev/null
@@ -0,0 +1,40 @@
+// +build linux
+
+package fs
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NameGroup implements a named cgroup subsystem (cgroup v1 "name=" hierarchy,
+// e.g. systemd's). It sets no resource knobs; it only optionally joins the
+// hierarchy.
+type NameGroup struct {
+       GroupName string
+       Join      bool
+}
+
+// Name returns the configured group name.
+func (s *NameGroup) Name() string {
+       return s.GroupName
+}
+
+// Apply joins the named cgroup when Join is set; join errors are
+// deliberately discarded so a missing named hierarchy is not fatal.
+func (s *NameGroup) Apply(d *cgroupData) error {
+       if s.Join {
+               // ignore errors if the named cgroup does not exist
+               d.join(s.GroupName)
+       }
+       return nil
+}
+
+// Set is a no-op: named cgroups carry no resource settings.
+func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
+       return nil
+}
+
+// Remove deletes the cgroup path if it was joined. The removePath error is
+// intentionally ignored (best-effort cleanup, mirroring Apply).
+func (s *NameGroup) Remove(d *cgroupData) error {
+       if s.Join {
+               removePath(d.path(s.GroupName))
+       }
+       return nil
+}
+
+// GetStats is a no-op: named cgroups export no statistics.
+func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/net_cls.go b/libcontainer/cgroups/fs/net_cls.go
new file mode 100644 (file)
index 0000000..0212015
--- /dev/null
@@ -0,0 +1,44 @@
+// +build linux
+
+package fs
+
+import (
+       "strconv"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NetClsGroup implements the net_cls cgroup v1 subsystem, which tags
+// network packets originating from the cgroup with a class identifier.
+type NetClsGroup struct {
+}
+
+// Name returns the subsystem name, "net_cls".
+func (s *NetClsGroup) Name() string {
+       return "net_cls"
+}
+
+// Apply joins the net_cls hierarchy; a missing hierarchy (IsNotFound) is
+// tolerated.
+func (s *NetClsGroup) Apply(d *cgroupData) error {
+       _, err := d.join("net_cls")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+// Set writes Resources.NetClsClassid to net_cls.classid when non-zero.
+func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.NetClsClassid != 0 {
+               if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// Remove deletes this cgroup's net_cls directory.
+func (s *NetClsGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("net_cls"))
+}
+
+// GetStats is a no-op: net_cls exposes no statistics.
+func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/net_cls_test.go b/libcontainer/cgroups/fs/net_cls_test.go
new file mode 100644 (file)
index 0000000..602133a
--- /dev/null
@@ -0,0 +1,41 @@
+// +build linux
+
+package fs
+
+import (
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// Class identifiers used to verify that Set overwrites an existing value.
+const (
+       classidBefore = 0x100002
+       classidAfter  = 0x100001
+)
+
+// TestNetClsSetClassid verifies that NetClsGroup.Set writes
+// Resources.NetClsClassid into net_cls.classid.
+func TestNetClsSetClassid(t *testing.T) {
+       helper := NewCgroupTestUtil("net_cls", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "net_cls.classid": strconv.FormatUint(classidBefore, 10),
+       })
+
+       helper.CgroupData.config.Resources.NetClsClassid = classidAfter
+       netcls := &NetClsGroup{}
+       if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       // As we are in mock environment, we can't get correct value of classid from
+       // net_cls.classid.
+       // So. we just judge if we successfully write classid into file
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid")
+       if err != nil {
+               t.Fatalf("Failed to parse net_cls.classid - %s", err)
+       }
+       if value != classidAfter {
+               t.Fatal("Got the wrong value, set net_cls.classid failed.")
+       }
+}
diff --git a/libcontainer/cgroups/fs/net_prio.go b/libcontainer/cgroups/fs/net_prio.go
new file mode 100644 (file)
index 0000000..2bdeedf
--- /dev/null
@@ -0,0 +1,42 @@
+// +build linux
+
+package fs
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NetPrioGroup implements the net_prio cgroup v1 subsystem, which assigns
+// per-interface egress priorities to traffic from the cgroup.
+type NetPrioGroup struct {
+}
+
+// Name returns the subsystem name, "net_prio".
+func (s *NetPrioGroup) Name() string {
+       return "net_prio"
+}
+
+// Apply joins the net_prio hierarchy; a missing hierarchy (IsNotFound) is
+// tolerated.
+func (s *NetPrioGroup) Apply(d *cgroupData) error {
+       _, err := d.join("net_prio")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+// Set writes each configured interface/priority pair to
+// net_prio.ifpriomap (one write per entry; the kernel merges them).
+func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
+       for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
+               if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// Remove deletes this cgroup's net_prio directory.
+func (s *NetPrioGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("net_prio"))
+}
+
+// GetStats is a no-op: net_prio exposes no statistics.
+func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/net_prio_test.go b/libcontainer/cgroups/fs/net_prio_test.go
new file mode 100644 (file)
index 0000000..2ce8e19
--- /dev/null
@@ -0,0 +1,39 @@
+// +build linux
+
+package fs
+
+import (
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// prioMap is the single interface/priority fixture used by the test below.
+var (
+       prioMap = []*configs.IfPrioMap{
+               {
+                       Interface: "test",
+                       Priority:  5,
+               },
+       }
+)
+
+// TestNetPrioSetIfPrio verifies that NetPrioGroup.Set writes the
+// "interface priority" pair into net_prio.ifpriomap.
+func TestNetPrioSetIfPrio(t *testing.T) {
+       helper := NewCgroupTestUtil("net_prio", t)
+       defer helper.cleanup()
+
+       helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
+       netPrio := &NetPrioGroup{}
+       if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
+       if err != nil {
+               t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
+       }
+       if !strings.Contains(value, "test 5") {
+               t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")
+       }
+}
diff --git a/libcontainer/cgroups/fs/perf_event.go b/libcontainer/cgroups/fs/perf_event.go
new file mode 100644 (file)
index 0000000..5693676
--- /dev/null
@@ -0,0 +1,35 @@
+// +build linux
+
+package fs
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// PerfEventGroup implements the perf_event cgroup v1 subsystem. It has no
+// tunables; membership alone scopes perf monitoring to the cgroup.
+type PerfEventGroup struct {
+}
+
+// Name returns the subsystem name, "perf_event".
+func (s *PerfEventGroup) Name() string {
+       return "perf_event"
+}
+
+// Apply joins the perf_event hierarchy; a missing hierarchy is tolerated.
+func (s *PerfEventGroup) Apply(d *cgroupData) error {
+       // we just want to join this group even though we don't set anything
+       if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+// Set is a no-op: perf_event has no resource settings.
+func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
+       return nil
+}
+
+// Remove deletes this cgroup's perf_event directory.
+func (s *PerfEventGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("perf_event"))
+}
+
+// GetStats is a no-op: perf_event exposes no statistics.
+func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go
new file mode 100644 (file)
index 0000000..7bf6801
--- /dev/null
@@ -0,0 +1,74 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "path/filepath"
+       "strconv"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// PidsGroup implements the pids cgroup v1 subsystem, which limits the
+// number of tasks (processes/threads) in the cgroup.
+type PidsGroup struct {
+}
+
+// Name returns the subsystem name, "pids".
+func (s *PidsGroup) Name() string {
+       return "pids"
+}
+
+// Apply joins the pids hierarchy; a missing hierarchy (IsNotFound) is
+// tolerated.
+func (s *PidsGroup) Apply(d *cgroupData) error {
+       _, err := d.join("pids")
+       if err != nil && !cgroups.IsNotFound(err) {
+               return err
+       }
+       return nil
+}
+
+// Set writes Resources.PidsLimit to pids.max: a positive limit is written
+// as a number, a negative limit as the literal "max" (unlimited), and a
+// zero limit leaves the file untouched.
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.PidsLimit != 0 {
+               // "max" is the fallback value.
+               limit := "max"
+
+               if cgroup.Resources.PidsLimit > 0 {
+                       limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+               }
+
+               if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// Remove deletes this cgroup's pids directory.
+func (s *PidsGroup) Remove(d *cgroupData) error {
+       return removePath(d.path("pids"))
+}
+
+// GetStats fills stats.PidsStats from pids.current and pids.max; a
+// pids.max of "max" is reported as Limit == 0 (no limit).
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
+       current, err := fscommon.GetCgroupParamUint(path, "pids.current")
+       if err != nil {
+               return fmt.Errorf("failed to parse pids.current - %s", err)
+       }
+
+       maxString, err := fscommon.GetCgroupParamString(path, "pids.max")
+       if err != nil {
+               return fmt.Errorf("failed to parse pids.max - %s", err)
+       }
+
+       // Default if pids.max == "max" is 0 -- which represents "no limit".
+       var max uint64
+       if maxString != "max" {
+               max, err = fscommon.ParseUint(maxString, 10, 64)
+               if err != nil {
+                       return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
+               }
+       }
+
+       stats.PidsStats.Current = current
+       stats.PidsStats.Limit = max
+       return nil
+}
diff --git a/libcontainer/cgroups/fs/pids_test.go b/libcontainer/cgroups/fs/pids_test.go
new file mode 100644 (file)
index 0000000..66f3aa3
--- /dev/null
@@ -0,0 +1,112 @@
+// +build linux
+
+package fs
+
+import (
+       "strconv"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// Limits used by the pids tests: -1 maps to "max" (unlimited) in Set.
+const (
+       maxUnlimited = -1
+       maxLimited   = 1024
+)
+
+// TestPidsSetMax verifies that Set replaces "max" with a numeric limit.
+func TestPidsSetMax(t *testing.T) {
+       helper := NewCgroupTestUtil("pids", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "pids.max": "max",
+       })
+
+       helper.CgroupData.config.Resources.PidsLimit = maxLimited
+       pids := &PidsGroup{}
+       if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max")
+       if err != nil {
+               t.Fatalf("Failed to parse pids.max - %s", err)
+       }
+
+       if value != maxLimited {
+               t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value)
+       }
+}
+
+// TestPidsSetUnlimited verifies that a negative limit is written as "max".
+func TestPidsSetUnlimited(t *testing.T) {
+       helper := NewCgroupTestUtil("pids", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "pids.max": strconv.Itoa(maxLimited),
+       })
+
+       helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
+       pids := &PidsGroup{}
+       if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max")
+       if err != nil {
+               t.Fatalf("Failed to parse pids.max - %s", err)
+       }
+
+       if value != "max" {
+               t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value)
+       }
+}
+
+// TestPidsStats verifies that GetStats reads pids.current and a numeric
+// pids.max into PidsStats.
+func TestPidsStats(t *testing.T) {
+       helper := NewCgroupTestUtil("pids", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "pids.current": strconv.Itoa(1337),
+               "pids.max":     strconv.Itoa(maxLimited),
+       })
+
+       pids := &PidsGroup{}
+       stats := *cgroups.NewStats()
+       if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+               t.Fatal(err)
+       }
+
+       if stats.PidsStats.Current != 1337 {
+               t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current)
+       }
+
+       if stats.PidsStats.Limit != maxLimited {
+               t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit)
+       }
+}
+
+// TestPidsStatsUnlimited verifies that a pids.max of "max" is reported as
+// Limit == 0 ("no limit").
+func TestPidsStatsUnlimited(t *testing.T) {
+       helper := NewCgroupTestUtil("pids", t)
+       defer helper.cleanup()
+
+       helper.writeFileContents(map[string]string{
+               "pids.current": strconv.Itoa(4096),
+               "pids.max":     "max",
+       })
+
+       pids := &PidsGroup{}
+       stats := *cgroups.NewStats()
+       if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+               t.Fatal(err)
+       }
+
+       if stats.PidsStats.Current != 4096 {
+               t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current)
+       }
+
+       if stats.PidsStats.Limit != 0 {
+               t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit)
+       }
+}
diff --git a/libcontainer/cgroups/fs/stats_util_test.go b/libcontainer/cgroups/fs/stats_util_test.go
new file mode 100644 (file)
index 0000000..c5a8d18
--- /dev/null
@@ -0,0 +1,123 @@
+// +build linux
+
+package fs
+
+import (
+       "fmt"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+
+       "github.com/sirupsen/logrus"
+)
+
+func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error {
+       if len(expected) != len(actual) {
+               return fmt.Errorf("blkioStatEntries length do not match")
+       }
+       for i, expValue := range expected {
+               actValue := actual[i]
+               if expValue != actValue {
+                       return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue)
+               }
+       }
+       return nil
+}
+
+func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) {
+       if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil {
+               logrus.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil {
+               logrus.Printf("blkio IoServicedRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil {
+               logrus.Printf("blkio IoQueuedRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil {
+               logrus.Printf("blkio SectorsRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil {
+               logrus.Printf("blkio IoServiceTimeRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil {
+               logrus.Printf("blkio IoWaitTimeRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil {
+               logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive)
+               t.Fail()
+       }
+
+       if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil {
+               logrus.Printf("blkio IoTimeRecursive do not match - %s\n", err)
+               t.Fail()
+       }
+}
+
+func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) {
+       if expected != actual {
+               logrus.Printf("Expected throttling data %v but found %v\n", expected, actual)
+               t.Fail()
+       }
+}
+
+func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) {
+       if expected != actual {
+               logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual)
+               t.Fail()
+       }
+}
+
+// expectMemoryStatEquals fails the test for any difference between the
+// two memory stats: usage/swap/kernel counters, the hierarchy flag, and
+// every key of the expected Stats map.
+func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) {
+       expectMemoryDataEquals(t, expected.Usage, actual.Usage)
+       expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage)
+       expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage)
+
+       if expected.UseHierarchy != actual.UseHierarchy {
+               logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy)
+               t.Fail()
+       }
+
+       // Only keys present in expected are checked; extra keys in actual
+       // are ignored. A missing key falls through and also fails the
+       // value comparison against the zero value.
+       for key, expValue := range expected.Stats {
+               actValue, ok := actual.Stats[key]
+               if !ok {
+                       logrus.Printf("Expected memory stat key %s not found\n", key)
+                       t.Fail()
+               }
+               if expValue != actValue {
+                       logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue)
+                       t.Fail()
+               }
+       }
+}
+
+// expectMemoryDataEquals fails the test for each field of MemoryData
+// (usage, max usage, failcnt, limit) that differs.
+func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) {
+       if expected.Usage != actual.Usage {
+               logrus.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage)
+               t.Fail()
+       }
+       if expected.MaxUsage != actual.MaxUsage {
+               logrus.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage)
+               t.Fail()
+       }
+       if expected.Failcnt != actual.Failcnt {
+               logrus.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt)
+               t.Fail()
+       }
+       if expected.Limit != actual.Limit {
+               logrus.Printf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit)
+               t.Fail()
+       }
+}
diff --git a/libcontainer/cgroups/fs/util_test.go b/libcontainer/cgroups/fs/util_test.go
new file mode 100644 (file)
index 0000000..2c50d6f
--- /dev/null
@@ -0,0 +1,68 @@
+// +build linux
+
+/*
+Utility for testing cgroup operations.
+
+Creates a mock of the cgroup filesystem for the duration of the test.
+*/
+package fs
+
+import (
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// cgroupTestUtil mocks a single-subsystem cgroup filesystem rooted in a
+// temporary directory for the duration of one test.
+type cgroupTestUtil struct {
+       // cgroup data to use in tests.
+       CgroupData *cgroupData
+
+       // Path to the mock cgroup directory.
+       CgroupPath string
+
+       // Temporary directory to store mock cgroup filesystem.
+       tempDir string
+       // t is the test to which fatal setup errors are reported.
+       t       *testing.T
+}
+
+// Creates a new test util for the specified subsystem
+func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil {
+       d := &cgroupData{
+               config: &configs.Cgroup{},
+       }
+       d.config.Resources = &configs.Resources{}
+       tempDir, err := ioutil.TempDir("", "cgroup_test")
+       if err != nil {
+               t.Fatal(err)
+       }
+       d.root = tempDir
+       testCgroupPath := filepath.Join(d.root, subsystem)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // Ensure the full mock cgroup path exists.
+       err = os.MkdirAll(testCgroupPath, 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t}
+}
+
+// cleanup removes the temporary mock cgroup tree.
+func (c *cgroupTestUtil) cleanup() {
+       os.RemoveAll(c.tempDir)
+}
+
+// Write the specified contents on the mock of the specified cgroup files.
+// Any write failure aborts the test immediately via t.Fatal.
+func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) {
+       for file, contents := range fileContents {
+               err := fscommon.WriteFile(c.CgroupPath, file, contents)
+               if err != nil {
+                       c.t.Fatal(err)
+               }
+       }
+}
diff --git a/libcontainer/cgroups/fs2/cpu.go b/libcontainer/cgroups/fs2/cpu.go
new file mode 100644 (file)
index 0000000..f0f5df0
--- /dev/null
@@ -0,0 +1,56 @@
+// +build linux
+
+package fs2
+
+import (
+       "bufio"
+       "os"
+       "path/filepath"
+       "strconv"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// setCpu applies the cgroup v2 cpu controller settings: cpu.weight and
+// cpu.max. A zero/empty value means "leave the kernel default alone".
+func setCpu(dirPath string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.CpuWeight != 0 {
+               if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil {
+                       return err
+               }
+       }
+
+       if cgroup.Resources.CpuMax != "" {
+               if err := fscommon.WriteFile(dirPath, "cpu.max", cgroup.Resources.CpuMax); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+// statCpu populates CpuStats from the cpu.stat file. The kernel reports
+// these values in microseconds; they are multiplied by 1000 here so the
+// stats are expressed in nanoseconds, matching cgroup v1 accounting.
+func statCpu(dirPath string, stats *cgroups.Stats) error {
+       f, err := os.Open(filepath.Join(dirPath, "cpu.stat"))
+       if err != nil {
+               return err
+       }
+       defer f.Close()
+
+       sc := bufio.NewScanner(f)
+       for sc.Scan() {
+               t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+               if err != nil {
+                       return err
+               }
+               // Unrecognized keys are silently skipped.
+               switch t {
+               case "usage_usec":
+                       stats.CpuStats.CpuUsage.TotalUsage = v * 1000
+
+               case "user_usec":
+                       stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
+
+               case "system_usec":
+                       stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
+               }
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs2/cpuset.go b/libcontainer/cgroups/fs2/cpuset.go
new file mode 100644 (file)
index 0000000..6492ac9
--- /dev/null
@@ -0,0 +1,22 @@
+// +build linux
+
+package fs2
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func setCpuset(dirPath string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.CpusetCpus != "" {
+               if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
+                       return err
+               }
+       }
+       if cgroup.Resources.CpusetMems != "" {
+               if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go
new file mode 100644 (file)
index 0000000..e84b33f
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+       "bufio"
+       "io"
+       "os"
+       "path/filepath"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/pkg/errors"
+)
+
+const UnifiedMountpoint = "/sys/fs/cgroup"
+
+// defaultDirPath derives the unified-hierarchy cgroup directory for c.
+// Exactly one of (Path) or (Name and/or Parent) may be set; relative
+// paths are resolved under the caller's own cgroup from /proc/self/cgroup.
+func defaultDirPath(c *configs.Cgroup) (string, error) {
+       if (c.Name != "" || c.Parent != "") && c.Path != "" {
+               return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
+       }
+       if len(c.Paths) != 0 {
+               // never set by specconv
+               return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c)
+       }
+
+       // XXX: Do not remove this code. Path safety is important! -- cyphar
+       cgPath := libcontainerUtils.CleanPath(c.Path)
+       cgParent := libcontainerUtils.CleanPath(c.Parent)
+       cgName := libcontainerUtils.CleanPath(c.Name)
+
+       ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
+       if err != nil {
+               return "", err
+       }
+       return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName, ownCgroup)
+}
+
+// _defaultDirPath is the testable core of defaultDirPath: every input is
+// passed explicitly so no /proc access is required. An absolute inner
+// path is anchored at root; a relative one under root/ownCgroup.
+func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string, error) {
+       if (cgName != "" || cgParent != "") && cgPath != "" {
+               return "", errors.New("cgroup: either Path or Name and Parent should be used")
+       }
+       innerPath := cgPath
+       if innerPath == "" {
+               innerPath = filepath.Join(cgParent, cgName)
+       }
+       if filepath.IsAbs(innerPath) {
+               return filepath.Join(root, innerPath), nil
+       }
+       return filepath.Join(root, ownCgroup, innerPath), nil
+}
+
+// parseCgroupFile parses a /proc/PID/cgroup file and returns the cgroup
+// v2 (unified hierarchy) path, i.e. the path from the "0::" entry.
+func parseCgroupFile(path string) (string, error) {
+       f, err := os.Open(path)
+       if err != nil {
+               return "", err
+       }
+       defer f.Close()
+       return parseCgroupFromReader(f)
+}
+
+func parseCgroupFromReader(r io.Reader) (string, error) {
+       var (
+               s = bufio.NewScanner(r)
+       )
+       for s.Scan() {
+               if err := s.Err(); err != nil {
+                       return "", err
+               }
+               var (
+                       text  = s.Text()
+                       parts = strings.SplitN(text, ":", 3)
+               )
+               if len(parts) < 3 {
+                       return "", errors.Errorf("invalid cgroup entry: %q", text)
+               }
+               // text is like "0::/user.slice/user-1001.slice/session-1.scope"
+               if parts[0] == "0" && parts[1] == "" {
+                       return parts[2], nil
+               }
+       }
+       return "", errors.New("cgroup path not found")
+}
diff --git a/libcontainer/cgroups/fs2/defaultpath_test.go b/libcontainer/cgroups/fs2/defaultpath_test.go
new file mode 100644 (file)
index 0000000..6d5d117
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+       "strings"
+       "testing"
+)
+
+// TestParseCgroupFromReader checks that the unified ("0::") entry is
+// extracted from /proc/PID/cgroup content; an empty expected value means
+// parsing must fail (no unified entry present).
+func TestParseCgroupFromReader(t *testing.T) {
+       cases := map[string]string{
+               "0::/user.slice/user-1001.slice/session-1.scope\n":                                  "/user.slice/user-1001.slice/session-1.scope",
+               "2:cpuset:/foo\n1:name=systemd:/\n":                                                 "",
+               "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope",
+       }
+       for s, expected := range cases {
+               g, err := parseCgroupFromReader(strings.NewReader(s))
+               if expected != "" {
+                       if string(g) != expected {
+                               t.Errorf("expected %q, got %q", expected, string(g))
+                       }
+                       if err != nil {
+                               t.Error(err)
+                       }
+               } else {
+                       if err == nil {
+                               t.Error("error is expected")
+                       }
+               }
+       }
+}
+
+// TestDefaultDirPath exercises _defaultDirPath with an absolute and a
+// relative cgroup path; relative paths must be joined under the caller's
+// own cgroup.
+func TestDefaultDirPath(t *testing.T) {
+       root := "/sys/fs/cgroup"
+       cases := []struct {
+               cgPath    string
+               cgParent  string
+               cgName    string
+               ownCgroup string
+               expected  string
+       }{
+               {
+                       cgPath:    "/foo/bar",
+                       ownCgroup: "/apple/banana",
+                       expected:  "/sys/fs/cgroup/foo/bar",
+               },
+               {
+                       cgPath:    "foo/bar",
+                       ownCgroup: "/apple/banana",
+                       expected:  "/sys/fs/cgroup/apple/banana/foo/bar",
+               },
+       }
+       for _, c := range cases {
+               got, err := _defaultDirPath(root, c.cgPath, c.cgParent, c.cgName, c.ownCgroup)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               if got != c.expected {
+                       t.Fatalf("expected %q, got %q", c.expected, got)
+               }
+       }
+}
diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go
new file mode 100644 (file)
index 0000000..e0fd685
--- /dev/null
@@ -0,0 +1,73 @@
+// +build linux
+
+package fs2
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
+       "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+       "golang.org/x/sys/unix"
+)
+
+// isRWM reports whether the device permission string grants all of
+// read ('r'), write ('w'), and mknod ('m') access.
+func isRWM(cgroupPermissions string) bool {
+       var r, w, m bool
+       for _, c := range cgroupPermissions {
+               switch c {
+               case 'r':
+                       r = true
+               case 'w':
+                       w = true
+               case 'm':
+                       m = true
+               }
+       }
+       return r && w && m
+}
+
+// canSkipEBPFError returns true when every configured device rule is a
+// deny of full rwm access; in that case a failure to attach the eBPF
+// device filter can be tolerated, since the kernel's behavior without
+// the filter is equivalent to the requested full deny.
+// the logic is from crun
+// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652
+func canSkipEBPFError(cgroup *configs.Cgroup) bool {
+       for _, dev := range cgroup.Resources.Devices {
+               if dev.Allow || !isRWM(dev.Permissions) {
+                       return false
+               }
+       }
+       return true
+}
+
+func setDevices(dirPath string, cgroup *configs.Cgroup) error {
+       devices := cgroup.Devices
+       if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil {
+               // never set by OCI specconv, but *allowAllDevices=false is still used by the integration test
+               if *allowAllDevices == true {
+                       return errors.New("libcontainer AllowAllDevices is not supported, use Devices")
+               }
+               for _, ad := range cgroup.Resources.AllowedDevices {
+                       d := *ad
+                       d.Allow = true
+                       devices = append(devices, &d)
+               }
+       }
+       if len(cgroup.Resources.DeniedDevices) != 0 {
+               // never set by OCI specconv
+               return errors.New("libcontainer DeniedDevices is not supported, use Devices")
+       }
+       insts, license, err := devicefilter.DeviceFilter(devices)
+       if err != nil {
+               return err
+       }
+       dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
+       if err != nil {
+               return errors.Errorf("cannot get dir FD for %s", dirPath)
+       }
+       defer unix.Close(dirFD)
+       if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
+               if !canSkipEBPFError(cgroup) {
+                       return err
+               }
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs2/freezer.go b/libcontainer/cgroups/fs2/freezer.go
new file mode 100644 (file)
index 0000000..130c63f
--- /dev/null
@@ -0,0 +1,53 @@
+// +build linux
+
+package fs2
+
+import (
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+)
+
+// setFreezer translates the desired FreezerState into a write to
+// "cgroup.freeze" (1 = frozen, 0 = thawed). Undefined is a no-op.
+func setFreezer(dirPath string, state configs.FreezerState) error {
+       var desired int
+       switch state {
+       case configs.Undefined:
+               return nil
+       case configs.Frozen:
+               desired = 1
+       case configs.Thawed:
+               desired = 0
+       default:
+               return errors.Errorf("unknown freezer state %+v", state)
+       }
+       supportedErr := supportsFreezer(dirPath)
+       if supportedErr != nil && desired != 0 {
+               // freezer support only matters when actually freezing; a
+               // thaw (desired == 0) on an unsupported kernel is ignored.
+               return errors.Wrap(supportedErr, "freezer not supported")
+       }
+       return freezeWithInt(dirPath, desired)
+}
+
+// supportsFreezer returns nil when the "cgroup.freeze" file is readable
+// in dirPath (kernel >= 5.2), otherwise the read error.
+func supportsFreezer(dirPath string) error {
+       _, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
+       return err
+}
+
+// freezeWithInt writes desired to "cgroup.freeze" and reads the file
+// back to confirm the kernel accepted the state change.
+func freezeWithInt(dirPath string, desired int) error {
+       desiredS := strconv.Itoa(desired)
+       if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil {
+               return err
+       }
+       got, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
+       if err != nil {
+               return err
+       }
+       if gotS := strings.TrimSpace(string(got)); gotS != desiredS {
+               return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS)
+       }
+       return nil
+}
diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go
new file mode 100644 (file)
index 0000000..4bb7091
--- /dev/null
@@ -0,0 +1,214 @@
+// +build linux
+
+package fs2
+
+import (
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "strings"
+
+       securejoin "github.com/cyphar/filepath-securejoin"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+)
+
+// NewManager creates a manager for cgroup v2 unified hierarchy.
+// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
+// If dirPath is empty, it is automatically set using config.
+// In rootless mode a failure to detect controllers is tolerated and an
+// empty controller set is used.
+func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) {
+       if config == nil {
+               config = &configs.Cgroup{}
+       }
+       if dirPath != "" {
+               // require a clean absolute path to avoid traversal surprises
+               if filepath.Clean(dirPath) != dirPath || !filepath.IsAbs(dirPath) {
+                       return nil, errors.Errorf("invalid dir path %q", dirPath)
+               }
+       } else {
+               var err error
+               dirPath, err = defaultDirPath(config)
+               if err != nil {
+                       return nil, err
+               }
+       }
+       controllers, err := detectControllers(dirPath)
+       if err != nil && !rootless {
+               return nil, err
+       }
+
+       m := &manager{
+               config:      config,
+               dirPath:     dirPath,
+               controllers: controllers,
+               rootless:    rootless,
+       }
+       return m, nil
+}
+
+// detectControllers creates dirPath if needed and returns the set of
+// controller names listed in its "cgroup.controllers" file.
+func detectControllers(dirPath string) (map[string]struct{}, error) {
+       if err := os.MkdirAll(dirPath, 0755); err != nil {
+               return nil, err
+       }
+       // SecureJoin guards against symlink escapes out of dirPath.
+       controllersPath, err := securejoin.SecureJoin(dirPath, "cgroup.controllers")
+       if err != nil {
+               return nil, err
+       }
+       controllersData, err := ioutil.ReadFile(controllersPath)
+       if err != nil {
+               return nil, err
+       }
+       controllersFields := strings.Fields(string(controllersData))
+       controllers := make(map[string]struct{}, len(controllersFields))
+       for _, c := range controllersFields {
+               controllers[c] = struct{}{}
+       }
+       return controllers, nil
+}
+
+// manager implements cgroups.Manager on the cgroup v2 unified hierarchy.
+type manager struct {
+       config *configs.Cgroup
+       // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
+       dirPath string
+       // controllers is content of "cgroup.controllers" file.
+       // excludes pseudo-controllers ("devices" and "freezer").
+       controllers map[string]struct{}
+       // rootless relaxes permission errors for unprivileged users.
+       rootless    bool
+}
+
+// Apply moves pid into this cgroup. In rootless mode a write failure
+// (typically a permission error) is tolerated.
+func (m *manager) Apply(pid int) error {
+       if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil && !m.rootless {
+               return err
+       }
+       return nil
+}
+
+// GetPids returns the PIDs in this cgroup only.
+func (m *manager) GetPids() ([]int, error) {
+       return cgroups.GetPids(m.dirPath)
+}
+
+// GetAllPids returns the PIDs in this cgroup and all of its descendants.
+func (m *manager) GetAllPids() ([]int, error) {
+       return cgroups.GetAllPids(m.dirPath)
+}
+
+// GetStats collects statistics from every available controller,
+// accumulating per-controller errors. In rootless mode the partial
+// stats are returned without error.
+func (m *manager) GetStats() (*cgroups.Stats, error) {
+       var (
+               st   cgroups.Stats
+               errs []error
+       )
+       // pids (since kernel 4.5)
+       if _, ok := m.controllers["pids"]; ok {
+               if err := statPids(m.dirPath, &st); err != nil {
+                       errs = append(errs, err)
+               }
+       } else {
+               if err := statPidsWithoutController(m.dirPath, &st); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // memory (since kernel 4.5)
+       if _, ok := m.controllers["memory"]; ok {
+               if err := statMemory(m.dirPath, &st); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // io (since kernel 4.5)
+       if _, ok := m.controllers["io"]; ok {
+               if err := statIo(m.dirPath, &st); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // cpu (since kernel 4.15)
+       if _, ok := m.controllers["cpu"]; ok {
+               if err := statCpu(m.dirPath, &st); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       if len(errs) > 0 && !m.rootless {
+               return &st, errors.Errorf("error while statting cgroup v2: %+v", errs)
+       }
+       return &st, nil
+}
+
+// Freeze sets the freezer state and records it in the config on success.
+func (m *manager) Freeze(state configs.FreezerState) error {
+       if err := setFreezer(m.dirPath, state); err != nil {
+               return err
+       }
+       m.config.Resources.Freezer = state
+       return nil
+}
+
+// Destroy removes the cgroup directory tree.
+func (m *manager) Destroy() error {
+       return os.RemoveAll(m.dirPath)
+}
+
+// GetPaths is for compatibility purpose and should be removed in future
+func (m *manager) GetPaths() map[string]string {
+       paths := map[string]string{
+               // pseudo-controller for compatibility
+               "devices": m.dirPath,
+               "freezer": m.dirPath,
+       }
+       // on v2 every controller shares the single unified directory
+       for c := range m.controllers {
+               paths[c] = m.dirPath
+       }
+       return paths
+}
+
+// GetUnifiedPath returns the single unified-hierarchy directory.
+func (m *manager) GetUnifiedPath() (string, error) {
+       return m.dirPath, nil
+}
+
+// Set applies the container's cgroup resource configuration, touching
+// only controllers that are available plus the "devices" and "freezer"
+// pseudo-controllers. Errors are accumulated; in rootless mode they are
+// ignored and the config is recorded regardless.
+func (m *manager) Set(container *configs.Config) error {
+       if container == nil || container.Cgroups == nil {
+               return nil
+       }
+       var errs []error
+       // pids (since kernel 4.5)
+       if _, ok := m.controllers["pids"]; ok {
+               if err := setPids(m.dirPath, container.Cgroups); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // memory (since kernel 4.5)
+       if _, ok := m.controllers["memory"]; ok {
+               if err := setMemory(m.dirPath, container.Cgroups); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // io (since kernel 4.5)
+       if _, ok := m.controllers["io"]; ok {
+               if err := setIo(m.dirPath, container.Cgroups); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // cpu (since kernel 4.15)
+       if _, ok := m.controllers["cpu"]; ok {
+               if err := setCpu(m.dirPath, container.Cgroups); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // devices (since kernel 4.15, pseudo-controller)
+       if err := setDevices(m.dirPath, container.Cgroups); err != nil {
+               errs = append(errs, err)
+       }
+       // cpuset (since kernel 5.0)
+       if _, ok := m.controllers["cpuset"]; ok {
+               if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
+                       errs = append(errs, err)
+               }
+       }
+       // freezer (since kernel 5.2, pseudo-controller)
+       if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
+               errs = append(errs, err)
+       }
+       if len(errs) > 0 && !m.rootless {
+               return errors.Errorf("error while setting cgroup v2: %+v", errs)
+       }
+       m.config = container.Cgroups
+       return nil
+}
+
+// GetCgroups returns the cgroup configuration this manager was built with.
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+       return m.config, nil
+}
diff --git a/libcontainer/cgroups/fs2/io.go b/libcontainer/cgroups/fs2/io.go
new file mode 100644 (file)
index 0000000..9a07308
--- /dev/null
@@ -0,0 +1,124 @@
+// +build linux
+
+package fs2
+
+import (
+       "bufio"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// setIo applies the cgroup v2 io controller settings: the BFQ weight
+// (replacing v1 blkio.weight) and per-device throttle limits, each
+// written as a separate line to "io.max".
+func setIo(dirPath string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.BlkioWeight != 0 {
+               filename := "io.bfq.weight"
+               if err := fscommon.WriteFile(dirPath, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+                       return err
+               }
+       }
+
+       for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
+               if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
+               if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
+               if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
+                       return err
+               }
+       }
+       for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
+               if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// readCgroup2MapFile parses a whitespace-delimited cgroup v2 file into a
+// map from the first field of each line to its remaining fields. Lines
+// with fewer than two fields are skipped.
+func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
+       ret := map[string][]string{}
+       p := filepath.Join(dirPath, name)
+       f, err := os.Open(p)
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+       scanner := bufio.NewScanner(f)
+       for scanner.Scan() {
+               line := scanner.Text()
+               parts := strings.Fields(line)
+               if len(parts) < 2 {
+                       continue
+               }
+               ret[parts[0]] = parts[1:]
+       }
+       if err := scanner.Err(); err != nil {
+               return nil, err
+       }
+       return ret, nil
+}
+
+// statIo populates BlkioStats.IoServiceBytesRecursive from "io.stat".
+// Each line is "MAJOR:MINOR key=value ...".
+// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+func statIo(dirPath string, stats *cgroups.Stats) error {
+       var ioServiceBytesRecursive []cgroups.BlkioStatEntry
+       values, err := readCgroup2MapFile(dirPath, "io.stat")
+       if err != nil {
+               return err
+       }
+       for k, v := range values {
+               d := strings.Split(k, ":")
+               if len(d) != 2 {
+                       continue
+               }
+               // Fixed: the device key is "major:minor", so d[0] is the
+               // major number and d[1] the minor; they were swapped before.
+               major, err := strconv.ParseUint(d[0], 10, 0)
+               if err != nil {
+                       return err
+               }
+               minor, err := strconv.ParseUint(d[1], 10, 0)
+               if err != nil {
+                       return err
+               }
+
+               for _, item := range v {
+                       d := strings.Split(item, "=")
+                       if len(d) != 2 {
+                               continue
+                       }
+                       op := d[0]
+
+                       // Accommodate the cgroup v1 naming
+                       switch op {
+                       case "rbytes":
+                               op = "read"
+                       case "wbytes":
+                               op = "write"
+                       }
+
+                       value, err := strconv.ParseUint(d[1], 10, 0)
+                       if err != nil {
+                               return err
+                       }
+
+                       entry := cgroups.BlkioStatEntry{
+                               Op:    op,
+                               Major: major,
+                               Minor: minor,
+                               Value: value,
+                       }
+                       ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
+               }
+       }
+       stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive}
+       return nil
+}
diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go
new file mode 100644 (file)
index 0000000..23eccbe
--- /dev/null
@@ -0,0 +1,103 @@
+// +build linux
+
+package fs2
+
+import (
+       "bufio"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+)
+
+// setMemory applies the memory limits from cgroup.Resources to the
+// cgroup v2 memory controller files under dirPath:
+//
+//     MemorySwap        -> memory.swap.max
+//     Memory            -> memory.max
+//     MemoryReservation -> memory.low
+//
+// A value of 0 means "not configured" and leaves the file untouched.
+// NOTE(review): in the v1 API Resources.MemorySwap means memory+swap,
+// while memory.swap.max counts swap only -- confirm the caller converts.
+func setMemory(dirPath string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.MemorySwap != 0 {
+               if err := fscommon.WriteFile(dirPath, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+                       return err
+               }
+       }
+       if cgroup.Resources.Memory != 0 {
+               if err := fscommon.WriteFile(dirPath, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+                       return err
+               }
+       }
+
+       // cgroup.Resources.KernelMemory is ignored (no cgroup v2 equivalent).
+
+       if cgroup.Resources.MemoryReservation != 0 {
+               if err := fscommon.WriteFile(dirPath, "memory.low", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// statMemory populates stats.MemoryStats from the cgroup v2 files in
+// dirPath: every key/value line of memory.stat goes into
+// MemoryStats.Stats, and the usage/limit pairs come from
+// memory.current/memory.max and memory.swap.current/memory.swap.max.
+func statMemory(dirPath string, stats *cgroups.Stats) error {
+       // Set stats from memory.stat.
+       statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat"))
+       if err != nil {
+               return err
+       }
+       defer statsFile.Close()
+
+       sc := bufio.NewScanner(statsFile)
+       for sc.Scan() {
+               t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+               if err != nil {
+                       return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text())
+               }
+               stats.MemoryStats.Stats[t] = v
+       }
+       // NOTE(review): sc.Err() is not checked after the loop, so a read
+       // error would be silently ignored.
+       // NOTE(review): cgroup v2 memory.stat reports page-cache memory
+       // under the key "file", not "cache" -- this lookup may always be
+       // zero; confirm against the kernel cgroup-v2 documentation.
+       stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
+
+       memoryUsage, err := getMemoryDataV2(dirPath, "")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.Usage = memoryUsage
+       swapUsage, err := getMemoryDataV2(dirPath, "swap")
+       if err != nil {
+               return err
+       }
+       stats.MemoryStats.SwapUsage = swapUsage
+
+       // cgroup v2 is always hierarchical.
+       stats.MemoryStats.UseHierarchy = true
+       return nil
+}
+
+// getMemoryDataV2 reads the usage/limit pair for a memory module from
+// the files <module>.current and <module>.max under path, where module
+// is "memory" when name is empty and "memory.<name>" otherwise (e.g.
+// "memory.swap" for name "swap"). For sub-modules, files that do not
+// exist are treated as "no data": a zero MemoryData is returned without
+// error. A limit of "max" comes back as math.MaxUint64 (see
+// fscommon.GetCgroupParamUint).
+func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
+       memoryData := cgroups.MemoryData{}
+
+       moduleName := "memory"
+       if name != "" {
+               moduleName = strings.Join([]string{"memory", name}, ".")
+       }
+       usage := strings.Join([]string{moduleName, "current"}, ".")
+       limit := strings.Join([]string{moduleName, "max"}, ".")
+
+       value, err := fscommon.GetCgroupParamUint(path, usage)
+       if err != nil {
+               // Missing files are only acceptable for sub-modules (e.g.
+               // swap accounting disabled); plain "memory" must exist.
+               if moduleName != "memory" && os.IsNotExist(err) {
+                       return cgroups.MemoryData{}, nil
+               }
+               return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage)
+       }
+       memoryData.Usage = value
+
+       value, err = fscommon.GetCgroupParamUint(path, limit)
+       if err != nil {
+               if moduleName != "memory" && os.IsNotExist(err) {
+                       return cgroups.MemoryData{}, nil
+               }
+               return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit)
+       }
+       memoryData.Limit = value
+
+       return memoryData, nil
+}
diff --git a/libcontainer/cgroups/fs2/pids.go b/libcontainer/cgroups/fs2/pids.go
new file mode 100644 (file)
index 0000000..db2d7ac
--- /dev/null
@@ -0,0 +1,90 @@
+// +build linux
+
+package fs2
+
+import (
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+       "golang.org/x/sys/unix"
+)
+
+// setPids writes cgroup.Resources.PidsLimit to pids.max under dirPath.
+// A negative limit selects the literal "max" (unlimited); a zero limit
+// means "not configured" and leaves the file untouched.
+func setPids(dirPath string, cgroup *configs.Cgroup) error {
+       if cgroup.Resources.PidsLimit != 0 {
+               // "max" is the fallback value.
+               limit := "max"
+
+               if cgroup.Resources.PidsLimit > 0 {
+                       limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+               }
+
+               if err := fscommon.WriteFile(dirPath, "pids.max", limit); err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
+// isNOTSUP reports whether err is an *os.PathError whose underlying
+// cause is ENOTSUP (operation not supported).
+func isNOTSUP(err error) bool {
+       switch err := err.(type) {
+       case *os.PathError:
+               return err.Err == unix.ENOTSUP
+       default:
+               return false
+       }
+}
+
+// statPidsWithoutController fills stats.PidsStats for a cgroup whose
+// pids controller is not enabled: it counts the entries of
+// cgroup.procs, falling back to cgroup.threads when reading procs fails
+// with ENOTSUP (which happens for cgroups in threaded mode). The limit
+// is reported as 0, i.e. "no limit".
+func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
+       // if the controller is not enabled, let's read PIDS from cgroups.procs
+       // (or threads if cgroup.threads is enabled)
+       contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs"))
+       if err != nil && isNOTSUP(err) {
+               contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads"))
+       }
+       if err != nil {
+               return err
+       }
+       // Collect the unique, non-empty lines; the count is the number of
+       // pids (or tids) in the cgroup.
+       pids := make(map[string]string)
+       for _, i := range strings.Split(string(contents), "\n") {
+               if i != "" {
+                       pids[i] = i
+               }
+       }
+       stats.PidsStats.Current = uint64(len(pids))
+       stats.PidsStats.Limit = 0
+       return nil
+}
+
+// statPids fills stats.PidsStats from the pids controller files
+// pids.current and pids.max under dirPath. The literal value "max" in
+// pids.max is reported as Limit == 0, meaning "no limit".
+func statPids(dirPath string, stats *cgroups.Stats) error {
+       current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
+       if err != nil {
+               return errors.Wrap(err, "failed to parse pids.current")
+       }
+
+       maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max")
+       if err != nil {
+               return errors.Wrap(err, "failed to parse pids.max")
+       }
+
+       // Default if pids.max == "max" is 0 -- which represents "no limit".
+       var max uint64
+       if maxString != "max" {
+               max, err = fscommon.ParseUint(maxString, 10, 64)
+               if err != nil {
+                       return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q",
+                               maxString, filepath.Join(dirPath, "pids.max"))
+               }
+       }
+
+       stats.PidsStats.Current = current
+       stats.PidsStats.Limit = max
+       return nil
+}
diff --git a/libcontainer/cgroups/fscommon/fscommon.go b/libcontainer/cgroups/fscommon/fscommon.go
new file mode 100644 (file)
index 0000000..dd92e8c
--- /dev/null
@@ -0,0 +1,36 @@
+// +build linux
+
+package fscommon
+
+import (
+       "io/ioutil"
+
+       securejoin "github.com/cyphar/filepath-securejoin"
+       "github.com/pkg/errors"
+)
+
+// WriteFile writes data to dir/file. The path is resolved with
+// SecureJoin so that symlinks inside dir cannot escape it. It is an
+// error to pass an empty dir.
+func WriteFile(dir, file, data string) error {
+       if dir == "" {
+               return errors.Errorf("no directory specified for %s", file)
+       }
+       path, err := securejoin.SecureJoin(dir, file)
+       if err != nil {
+               return err
+       }
+       if err := ioutil.WriteFile(path, []byte(data), 0700); err != nil {
+               return errors.Wrapf(err, "failed to write %q to %q", data, path)
+       }
+       return nil
+}
+
+// ReadFile returns the contents of dir/file as a string. The path is
+// resolved with SecureJoin so that symlinks inside dir cannot escape
+// it. It is an error to pass an empty dir.
+func ReadFile(dir, file string) (string, error) {
+       if dir == "" {
+               return "", errors.Errorf("no directory specified for %s", file)
+       }
+       path, err := securejoin.SecureJoin(dir, file)
+       if err != nil {
+               return "", err
+       }
+       data, err := ioutil.ReadFile(path)
+       return string(data), err
+}
diff --git a/libcontainer/cgroups/fscommon/utils.go b/libcontainer/cgroups/fscommon/utils.go
new file mode 100644 (file)
index 0000000..46c3c77
--- /dev/null
@@ -0,0 +1,83 @@
+// +build linux
+
+package fscommon
+
+import (
+       "errors"
+       "fmt"
+       "io/ioutil"
+       "math"
+       "path/filepath"
+       "strconv"
+       "strings"
+)
+
+var (
+       ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint converts s to a uint64, saturating negative values at zero.
+// Due to kernel bugs, some of the memory cgroup stats can be negative;
+// both negatives representable as int64 and negatives below MinInt64
+// (which fail ParseInt with ErrRange) are mapped to 0 without error.
+// Any other parse failure is returned unchanged.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+       value, err := strconv.ParseUint(s, base, bitSize)
+       if err != nil {
+               intValue, intErr := strconv.ParseInt(s, base, bitSize)
+               // 1. Handle negative values greater than MinInt64 (and)
+               // 2. Handle negative values lesser than MinInt64
+               if intErr == nil && intValue < 0 {
+                       return 0, nil
+               } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+                       return 0, nil
+               }
+
+               return value, err
+       }
+
+       return value, nil
+}
+
+// GetCgroupParamKeyValue parses a "name value" cgroup parameter line
+// and returns the two parts, e.g. "io_service_bytes 1234" yields
+// ("io_service_bytes", 1234). Lines that do not have exactly two
+// whitespace-separated fields produce ErrNotValidFormat; negative
+// values saturate to 0 via ParseUint.
+func GetCgroupParamKeyValue(t string) (string, uint64, error) {
+       parts := strings.Fields(t)
+       switch len(parts) {
+       case 2:
+               value, err := ParseUint(parts[1], 10, 64)
+               if err != nil {
+                       return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
+               }
+
+               return parts[0], value, nil
+       default:
+               return "", 0, ErrNotValidFormat
+       }
+}
+
+// GetCgroupParamUint reads a single uint64 value from
+// cgroupPath/cgroupFile. The literal value "max" is returned as
+// math.MaxUint64; negative values saturate to 0 via ParseUint. A
+// missing file surfaces as the raw ReadFile error (so callers can use
+// os.IsNotExist on it).
+func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
+       fileName := filepath.Join(cgroupPath, cgroupFile)
+       contents, err := ioutil.ReadFile(fileName)
+       if err != nil {
+               return 0, err
+       }
+       trimmed := strings.TrimSpace(string(contents))
+       if trimmed == "max" {
+               return math.MaxUint64, nil
+       }
+
+       res, err := ParseUint(trimmed, 10, 64)
+       if err != nil {
+               return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
+       }
+       return res, nil
+}
+
+// GetCgroupParamString reads cgroupPath/cgroupFile and returns its
+// contents with surrounding whitespace (including the trailing newline)
+// trimmed.
+func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
+       contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile))
+       if err != nil {
+               return "", err
+       }
+
+       return strings.TrimSpace(string(contents)), nil
+}
diff --git a/libcontainer/cgroups/fscommon/utils_test.go b/libcontainer/cgroups/fscommon/utils_test.go
new file mode 100644 (file)
index 0000000..d0c5668
--- /dev/null
@@ -0,0 +1,97 @@
+// +build linux
+
+package fscommon
+
+import (
+       "io/ioutil"
+       "math"
+       "os"
+       "path/filepath"
+       "strconv"
+       "testing"
+)
+
+// Test fixtures: a single cgroup-style file holding the value 2048.
+// (The "float" names are historical; the file content is an integer.)
+const (
+       cgroupFile  = "cgroup.file"
+       floatValue  = 2048.0
+       floatString = "2048"
+)
+
+// TestGetCgroupParamsInt exercises GetCgroupParamUint against a temp
+// file: a plain value, a value with a trailing newline, negative values
+// (which saturate to 0), a value below MinInt64, non-numeric content
+// (error expected) and a missing file (error expected).
+func TestGetCgroupParamsInt(t *testing.T) {
+       // Setup tempdir.
+       tempDir, err := ioutil.TempDir("", "cgroup_utils_test")
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer os.RemoveAll(tempDir)
+       tempFile := filepath.Join(tempDir, cgroupFile)
+
+       // Success.
+       err = ioutil.WriteFile(tempFile, []byte(floatString), 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       value, err := GetCgroupParamUint(tempDir, cgroupFile)
+       if err != nil {
+               t.Fatal(err)
+       } else if value != floatValue {
+               t.Fatalf("Expected %d to equal %f", value, floatValue)
+       }
+
+       // Success with new line.
+       err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       value, err = GetCgroupParamUint(tempDir, cgroupFile)
+       if err != nil {
+               t.Fatal(err)
+       } else if value != floatValue {
+               t.Fatalf("Expected %d to equal %f", value, floatValue)
+       }
+
+       // Success with negative values
+       err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       value, err = GetCgroupParamUint(tempDir, cgroupFile)
+       if err != nil {
+               t.Fatal(err)
+       } else if value != 0 {
+               t.Fatalf("Expected %d to equal %d", value, 0)
+       }
+
+       // Success with negative values lesser than min int64
+       s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64)
+       err = ioutil.WriteFile(tempFile, []byte(s), 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       value, err = GetCgroupParamUint(tempDir, cgroupFile)
+       if err != nil {
+               t.Fatal(err)
+       } else if value != 0 {
+               t.Fatalf("Expected %d to equal %d", value, 0)
+       }
+
+       // Not a float.
+       err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       _, err = GetCgroupParamUint(tempDir, cgroupFile)
+       if err == nil {
+               t.Fatal("Expecting error, got none")
+       }
+
+       // Unknown file.
+       err = os.Remove(tempFile)
+       if err != nil {
+               t.Fatal(err)
+       }
+       _, err = GetCgroupParamUint(tempDir, cgroupFile)
+       if err == nil {
+               t.Fatal("Expecting error, got none")
+       }
+}
diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go
new file mode 100644 (file)
index 0000000..8eeedc5
--- /dev/null
@@ -0,0 +1,108 @@
+// +build linux
+
+package cgroups
+
+// ThrottlingData holds CPU throttling counters from the cpu controller.
+type ThrottlingData struct {
+       // Number of periods with throttling active
+       Periods uint64 `json:"periods,omitempty"`
+       // Number of periods when the container hit its throttling limit.
+       ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
+       // Aggregate time the container was throttled for in nanoseconds.
+       ThrottledTime uint64 `json:"throttled_time,omitempty"`
+}
+
+// CpuUsage denotes the usage of a CPU.
+// All CPU stats are aggregate since container inception.
+type CpuUsage struct {
+       // Total CPU time consumed.
+       // Units: nanoseconds.
+       TotalUsage uint64 `json:"total_usage,omitempty"`
+       // Total CPU time consumed per core.
+       // Units: nanoseconds.
+       PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
+       // Time spent by tasks of the cgroup in kernel mode.
+       // Units: nanoseconds.
+       UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
+       // Time spent by tasks of the cgroup in user mode.
+       // Units: nanoseconds.
+       UsageInUsermode uint64 `json:"usage_in_usermode"`
+}
+
+// CpuStats combines usage and throttling information for the cpu
+// controller.
+type CpuStats struct {
+       CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
+       ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
+}
+
+// MemoryData is a usage/limit snapshot for one memory accounting
+// module (memory, swap, kernel, ...).
+type MemoryData struct {
+       Usage    uint64 `json:"usage,omitempty"`
+       MaxUsage uint64 `json:"max_usage,omitempty"`
+       Failcnt  uint64 `json:"failcnt"`
+       Limit    uint64 `json:"limit"`
+}
+
+// MemoryStats aggregates the memory controller statistics.
+type MemoryStats struct {
+       // memory used for cache
+       Cache uint64 `json:"cache,omitempty"`
+       // usage of memory
+       Usage MemoryData `json:"usage,omitempty"`
+       // usage of memory + swap
+       SwapUsage MemoryData `json:"swap_usage,omitempty"`
+       // usage of kernel memory
+       KernelUsage MemoryData `json:"kernel_usage,omitempty"`
+       // usage of kernel TCP memory
+       KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
+       // if true, memory usage is accounted for throughout a hierarchy of cgroups.
+       UseHierarchy bool `json:"use_hierarchy"`
+
+       Stats map[string]uint64 `json:"stats,omitempty"`
+}
+
+// PidsStats holds the pids controller counters.
+type PidsStats struct {
+       // number of pids in the cgroup
+       Current uint64 `json:"current,omitempty"`
+       // active pids hard limit
+       Limit uint64 `json:"limit,omitempty"`
+}
+
+// BlkioStatEntry is one per-device, per-operation block I/O counter.
+type BlkioStatEntry struct {
+       Major uint64 `json:"major,omitempty"`
+       Minor uint64 `json:"minor,omitempty"`
+       Op    string `json:"op,omitempty"`
+       Value uint64 `json:"value,omitempty"`
+}
+
+// BlkioStats aggregates the blkio controller statistics.
+type BlkioStats struct {
+       // number of bytes transferred to and from the block device
+       IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
+       IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
+       IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
+       IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
+       IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
+       IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
+       IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
+       SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
+}
+
+// HugetlbStats holds the hugetlb controller counters.
+type HugetlbStats struct {
+       // current res_counter usage for hugetlb
+       Usage uint64 `json:"usage,omitempty"`
+       // maximum usage ever recorded.
+       MaxUsage uint64 `json:"max_usage,omitempty"`
+       // number of times hugetlb usage allocation failure.
+       Failcnt uint64 `json:"failcnt"`
+}
+
+// Stats is the top-level container for all per-controller statistics.
+type Stats struct {
+       CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+       MemoryStats MemoryStats `json:"memory_stats,omitempty"`
+       PidsStats   PidsStats   `json:"pids_stats,omitempty"`
+       BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
+       // the map is in the format "size of hugepage: stats of the hugepage"
+       HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
+}
+
+// NewStats returns a Stats with its map-typed fields
+// (MemoryStats.Stats and HugetlbStats) pre-allocated so callers can
+// write into them directly.
+func NewStats() *Stats {
+       memoryStats := MemoryStats{Stats: make(map[string]uint64)}
+       hugetlbStats := make(map[string]HugetlbStats)
+       return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
+}
diff --git a/libcontainer/cgroups/systemd/apply_nosystemd.go b/libcontainer/cgroups/systemd/apply_nosystemd.go
new file mode 100644 (file)
index 0000000..ef0db5a
--- /dev/null
@@ -0,0 +1,67 @@
+// +build !linux
+
+package systemd
+
+import (
+       "fmt"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Manager is the stub systemd cgroup manager used on non-Linux builds
+// (see the !linux build tag above): every operation fails with
+// "Systemd not supported", and UseSystemd always reports false.
+type Manager struct {
+       Cgroups *configs.Cgroup
+       Paths   map[string]string
+}
+
+func UseSystemd() bool {
+       return false
+}
+
+func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
+       return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Apply(pid int) error {
+       return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetPids() ([]int, error) {
+       return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetAllPids() ([]int, error) {
+       return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Destroy() error {
+       return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetPaths() map[string]string {
+       return nil
+}
+
+func (m *Manager) GetUnifiedPath() (string, error) {
+       return "", fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+       return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Set(container *configs.Config) error {
+       return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Freeze(state configs.FreezerState) error {
+       return fmt.Errorf("Systemd not supported")
+}
+
+func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
+       return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+       return nil, fmt.Errorf("Systemd not supported")
+}
diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go
new file mode 100644 (file)
index 0000000..c4b19b3
--- /dev/null
@@ -0,0 +1,534 @@
+// +build linux
+
+package systemd
+
+import (
+       "errors"
+       "fmt"
+       "io/ioutil"
+       "math"
+       "os"
+       "path/filepath"
+       "strings"
+       "sync"
+       "time"
+
+       systemdDbus "github.com/coreos/go-systemd/dbus"
+       "github.com/godbus/dbus"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/sirupsen/logrus"
+)
+
+// LegacyManager manages cgroup v1 ("legacy") hierarchies via systemd:
+// a transient unit is created over dbus and processes are then joined
+// to the per-subsystem v1 paths. mu guards Paths.
+type LegacyManager struct {
+       mu      sync.Mutex
+       Cgroups *configs.Cgroup
+       Paths   map[string]string
+}
+
+// subsystem is the per-controller interface implemented by the fs
+// (cgroup v1) groups used below.
+type subsystem interface {
+       // Name returns the name of the subsystem.
+       Name() string
+       // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
+       GetStats(path string, stats *cgroups.Stats) error
+       // Set the cgroup represented by cgroup.
+       Set(path string, cgroup *configs.Cgroup) error
+}
+
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
+type subsystemSet []subsystem
+
+// Get returns the subsystem with the given name, or
+// errSubsystemDoesNotExist when the set does not contain it.
+func (s subsystemSet) Get(name string) (subsystem, error) {
+       for _, ss := range s {
+               if ss.Name() == name {
+                       return ss, nil
+               }
+       }
+       return nil, errSubsystemDoesNotExist
+}
+
+// legacySubsystems lists every cgroup v1 controller this manager joins
+// and reports on.
+var legacySubsystems = subsystemSet{
+       &fs.CpusetGroup{},
+       &fs.DevicesGroup{},
+       &fs.MemoryGroup{},
+       &fs.CpuGroup{},
+       &fs.CpuacctGroup{},
+       &fs.PidsGroup{},
+       &fs.BlkioGroup{},
+       &fs.HugetlbGroup{},
+       &fs.PerfEventGroup{},
+       &fs.FreezerGroup{},
+       &fs.NetPrioGroup{},
+       &fs.NetClsGroup{},
+       &fs.NameGroup{GroupName: "name=systemd"},
+}
+
+const (
+       testScopeWait = 4
+       testSliceWait = 4
+)
+
+var (
+       // connLock guards theConn, the lazily-established, shared dbus
+       // connection to systemd (see UseSystemd).
+       connLock sync.Mutex
+       theConn  *systemdDbus.Conn
+)
+
+// newProp wraps a name/value pair as a systemd dbus unit property.
+func newProp(name string, units interface{}) systemdDbus.Property {
+       return systemdDbus.Property{
+               Name:  name,
+               Value: dbus.MakeVariant(units),
+       }
+}
+
+// NOTE: This function comes from package github.com/coreos/go-systemd/util
+// It was borrowed here to avoid a dependency on cgo.
+//
+// IsRunningSystemd checks whether the host was booted with systemd as its init
+// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
+// checks whether /run/systemd/system/ exists and is a directory.
+// http://www.freedesktop.org/software/systemd/man/sd_booted.html
+func isRunningSystemd() bool {
+       fi, err := os.Lstat("/run/systemd/system")
+       if err != nil {
+               return false
+       }
+       return fi.IsDir()
+}
+
+// UseSystemd reports whether systemd can act as the cgroup manager:
+// the host must be booted with systemd and a dbus connection to it
+// must be established. The connection is created once and cached in
+// theConn under connLock.
+func UseSystemd() bool {
+       if !isRunningSystemd() {
+               return false
+       }
+
+       connLock.Lock()
+       defer connLock.Unlock()
+
+       if theConn == nil {
+               var err error
+               theConn, err = systemdDbus.New()
+               if err != nil {
+                       return false
+               }
+       }
+       return true
+}
+
+// NewSystemdCgroupsManager returns a factory for systemd-backed cgroup
+// managers: a UnifiedManager when the host runs in cgroup v2 unified
+// mode, a LegacyManager (cgroup v1) otherwise. It fails when the host
+// was not booted with systemd.
+func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
+       if !isRunningSystemd() {
+               return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager")
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+                       return &UnifiedManager{
+                               Cgroups: config,
+                               Paths:   paths,
+                       }
+               }, nil
+       }
+       return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+               return &LegacyManager{
+                       Cgroups: config,
+                       Paths:   paths,
+               }
+       }, nil
+}
+
+// Apply creates (or joins) the container's cgroups and places pid in
+// them. When c.Paths is set, those paths are taken as-is and pid is
+// simply entered into them. Otherwise a transient systemd unit (scope
+// or slice, chosen by the unit-name suffix) is started with accounting
+// enabled and the configured resource properties, pid is joined to the
+// per-subsystem v1 hierarchies, and m.Paths is populated with the
+// resulting subsystem paths. pid == -1 is used for general slice
+// creation without placing a process.
+func (m *LegacyManager) Apply(pid int) error {
+       var (
+               c          = m.Cgroups
+               unitName   = getUnitName(c)
+               slice      = "system.slice"
+               properties []systemdDbus.Property
+       )
+
+       if c.Paths != nil {
+               paths := make(map[string]string)
+               for name, path := range c.Paths {
+                       _, err := getSubsystemPath(m.Cgroups, name)
+                       if err != nil {
+                               // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+                               if cgroups.IsNotFound(err) {
+                                       continue
+                               }
+                               return err
+                       }
+                       paths[name] = path
+               }
+               m.Paths = paths
+               return cgroups.EnterPid(m.Paths, pid)
+       }
+
+       if c.Parent != "" {
+               slice = c.Parent
+       }
+
+       properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+       // if we create a slice, the parent is defined via a Wants=
+       if strings.HasSuffix(unitName, ".slice") {
+               properties = append(properties, systemdDbus.PropWants(slice))
+       } else {
+               // otherwise, we use Slice=
+               properties = append(properties, systemdDbus.PropSlice(slice))
+       }
+
+       // only add pid if its valid, -1 is used w/ general slice creation.
+       if pid != -1 {
+               properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+       }
+
+       // Check if we can delegate. This is only supported on systemd versions 218 and above.
+       if !strings.HasSuffix(unitName, ".slice") {
+               // Assume scopes always support delegation.
+               properties = append(properties, newProp("Delegate", true))
+       }
+
+       // Always enable accounting, this gets us the same behaviour as the fs implementation,
+       // plus the kernel has some problems with joining the memory cgroup at a later time.
+       properties = append(properties,
+               newProp("MemoryAccounting", true),
+               newProp("CPUAccounting", true),
+               newProp("BlockIOAccounting", true))
+
+       // Assume DefaultDependencies= will always work (the check for it was previously broken.)
+       properties = append(properties,
+               newProp("DefaultDependencies", false))
+
+       if c.Resources.Memory != 0 {
+               properties = append(properties,
+                       newProp("MemoryLimit", uint64(c.Resources.Memory)))
+       }
+
+       if c.Resources.CpuShares != 0 {
+               properties = append(properties,
+                       newProp("CPUShares", c.Resources.CpuShares))
+       }
+
+       // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
+       if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
+               // corresponds to USEC_INFINITY in systemd
+               // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
+               // always setting a property value ensures we can apply a quota and remove it later
+               cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+               if c.Resources.CpuQuota > 0 {
+                       // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+                       // (integer percentage of CPU) internally.  This means that if a fractional percent of
+                       // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+                       // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+                       cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
+                       if cpuQuotaPerSecUSec%10000 != 0 {
+                               cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+                       }
+               }
+               properties = append(properties,
+                       newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+       }
+
+       if c.Resources.BlkioWeight != 0 {
+               properties = append(properties,
+                       newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
+       }
+
+       if c.Resources.PidsLimit > 0 {
+               properties = append(properties,
+                       newProp("TasksAccounting", true),
+                       newProp("TasksMax", uint64(c.Resources.PidsLimit)))
+       }
+
+       // We have to set kernel memory here, as we can't change it once
+       // processes have been attached to the cgroup.
+       if c.Resources.KernelMemory != 0 {
+               if err := setKernelMemory(c); err != nil {
+                       return err
+               }
+       }
+
+       // Start the transient unit and wait (with a 1s timeout) for its
+       // completion signal; an already-existing unit is not an error.
+       statusChan := make(chan string, 1)
+       if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+               select {
+               case <-statusChan:
+               case <-time.After(time.Second):
+                       logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
+               }
+       } else if !isUnitExists(err) {
+               return err
+       }
+
+       if err := joinCgroups(c, pid); err != nil {
+               return err
+       }
+
+       // Record the per-subsystem cgroup paths for later use.
+       paths := make(map[string]string)
+       for _, s := range legacySubsystems {
+               subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
+               if err != nil {
+                       // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+                       if cgroups.IsNotFound(err) {
+                               continue
+                       }
+                       return err
+               }
+               paths[s.Name()] = subsystemPath
+       }
+       m.Paths = paths
+       return nil
+}
+
+// Destroy stops the transient systemd unit and removes the recorded
+// cgroup paths. It is a no-op when the cgroup paths were supplied by
+// the caller (m.Cgroups.Paths != nil). The StopUnit error is
+// deliberately ignored; RemovePaths is the authoritative cleanup.
+func (m *LegacyManager) Destroy() error {
+       if m.Cgroups.Paths != nil {
+               return nil
+       }
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
+       if err := cgroups.RemovePaths(m.Paths); err != nil {
+               return err
+       }
+       m.Paths = make(map[string]string)
+       return nil
+}
+
+// GetPaths returns the subsystem-name -> cgroup-path map recorded by
+// Apply, under the manager's lock.
+func (m *LegacyManager) GetPaths() map[string]string {
+       m.mu.Lock()
+       paths := m.Paths
+       m.mu.Unlock()
+       return paths
+}
+
+// GetUnifiedPath always fails for the legacy (cgroup v1) manager; a
+// single unified path only exists in cgroup v2 unified mode.
+func (m *LegacyManager) GetUnifiedPath() (string, error) {
+       return "", errors.New("unified path is only supported when running in unified mode")
+}
+
+// join creates the cgroup directory for the given v1 subsystem (if it
+// does not already exist) and adds pid to its cgroup.procs, returning
+// the cgroup path.
+func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
+       path, err := getSubsystemPath(c, subsystem)
+       if err != nil {
+               return "", err
+       }
+
+       if err := os.MkdirAll(path, 0755); err != nil {
+               return "", err
+       }
+       if err := cgroups.WriteCgroupProc(path, pid); err != nil {
+               return "", err
+       }
+       return path, nil
+}
+
+// joinCgroups places pid into every legacy subsystem's cgroup:
+// "name=systemd" is left to systemd itself, cpuset goes through
+// ApplyDir (which handles its cpus/mems initialization), and for the
+// rest a missing hierarchy is only fatal for "devices" -- a hard
+// security requirement -- while other subsystems are optional.
+func joinCgroups(c *configs.Cgroup, pid int) error {
+       for _, sys := range legacySubsystems {
+               name := sys.Name()
+               switch name {
+               case "name=systemd":
+                       // let systemd handle this
+               case "cpuset":
+                       path, err := getSubsystemPath(c, name)
+                       if err != nil && !cgroups.IsNotFound(err) {
+                               return err
+                       }
+                       s := &fs.CpusetGroup{}
+                       if err := s.ApplyDir(path, c, pid); err != nil {
+                               return err
+                       }
+               default:
+                       _, err := join(c, name, pid)
+                       if err != nil {
+                               // Even if it's `not found` error, we'll return err
+                               // because devices cgroup is hard requirement for
+                               // container security.
+                               if name == "devices" {
+                                       return err
+                               }
+                               // For other subsystems, omit the `not found` error
+                               // because they are optional.
+                               if !cgroups.IsNotFound(err) {
+                                       return err
+                               }
+                       }
+               }
+       }
+
+       return nil
+}
+
// ExpandSlice converts a systemd slice name into its filesystem path.
// systemd encodes slice nesting with dashes, so "test-a-b.slice" maps to
// "/test.slice/test-a.slice/test-a-b.slice".
func ExpandSlice(slice string) (string, error) {
	const suffix = ".slice"
	// The name must end in ".slice" and must not contain path separators.
	// (A name shorter than the suffix fails the HasSuffix check anyway.)
	if !strings.HasSuffix(slice, suffix) || strings.Contains(slice, "/") {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	stem := strings.TrimSuffix(slice, suffix)
	// "-.slice" is systemd's name for the root slice.
	if stem == "-" {
		return "/", nil
	}

	var path, prefix string
	for _, part := range strings.Split(stem, "-") {
		// An empty component means a doubled or leading dash
		// ("test--a.slice", "-test.slice"); both are invalid.
		// This also rejects the bare ".slice" name (empty stem).
		if part == "" {
			return "", fmt.Errorf("invalid slice name: %s", slice)
		}
		path += "/" + prefix + part + suffix
		prefix += part + "-"
	}
	return path, nil
}
+
+func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
+       mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
+       if err != nil {
+               return "", err
+       }
+
+       initPath, err := cgroups.GetInitCgroup(subsystem)
+       if err != nil {
+               return "", err
+       }
+       // if pid 1 is systemd 226 or later, it will be in init.scope, not the root
+       initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
+
+       slice := "system.slice"
+       if c.Parent != "" {
+               slice = c.Parent
+       }
+
+       slice, err = ExpandSlice(slice)
+       if err != nil {
+               return "", err
+       }
+
+       return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
+}
+
+func (m *LegacyManager) Freeze(state configs.FreezerState) error {
+       path, err := getSubsystemPath(m.Cgroups, "freezer")
+       if err != nil {
+               return err
+       }
+       prevState := m.Cgroups.Resources.Freezer
+       m.Cgroups.Resources.Freezer = state
+       freezer, err := legacySubsystems.Get("freezer")
+       if err != nil {
+               return err
+       }
+       err = freezer.Set(path, m.Cgroups)
+       if err != nil {
+               m.Cgroups.Resources.Freezer = prevState
+               return err
+       }
+       return nil
+}
+
+func (m *LegacyManager) GetPids() ([]int, error) {
+       path, err := getSubsystemPath(m.Cgroups, "devices")
+       if err != nil {
+               return nil, err
+       }
+       return cgroups.GetPids(path)
+}
+
+func (m *LegacyManager) GetAllPids() ([]int, error) {
+       path, err := getSubsystemPath(m.Cgroups, "devices")
+       if err != nil {
+               return nil, err
+       }
+       return cgroups.GetAllPids(path)
+}
+
// GetStats aggregates statistics from every recorded cgroup path into a
// single Stats structure.
func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	stats := cgroups.NewStats()
	for name, path := range m.Paths {
		sys, err := legacySubsystems.Get(name)
		// Skip unknown subsystems and paths that no longer exist.
		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
			continue
		}
		if err := sys.GetStats(path, stats); err != nil {
			return nil, err
		}
	}

	return stats, nil
}
+
// Set applies the container's resource limits to each legacy subsystem.
// It is a no-op when the caller provided pre-existing cgroup paths.
func (m *LegacyManager) Set(container *configs.Config) error {
	// If Paths are set, then we are just joining cgroups paths
	// and there is no need to set any values.
	if m.Cgroups.Paths != nil {
		return nil
	}
	for _, sys := range legacySubsystems {
		// Get the subsystem path, but don't error out for not found cgroups.
		// NOTE(review): on a not-found error, path stays "" but is still
		// passed to sys.Set below — confirm every subsystem tolerates that.
		path, err := getSubsystemPath(container.Cgroups, sys.Name())
		if err != nil && !cgroups.IsNotFound(err) {
			return err
		}

		if err := sys.Set(path, container.Cgroups); err != nil {
			return err
		}
	}

	// Cross-check the applied cpu shares against the requested value
	// (presumably CheckCpushares reports a mismatch — see fs package).
	if m.Paths["cpu"] != "" {
		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
			return err
		}
	}
	return nil
}
+
+func getUnitName(c *configs.Cgroup) string {
+       // by default, we create a scope unless the user explicitly asks for a slice.
+       if !strings.HasSuffix(c.Name, ".slice") {
+               return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
+       }
+       return c.Name
+}
+
// setKernelMemory enables kernel memory accounting on the container's
// memory cgroup, creating the directory if needed. It bails out early if
// the cgroup already has tasks, since accounting is only enabled before
// any process joins (see the comment below).
func setKernelMemory(c *configs.Cgroup) error {
	// NOTE(review): on a not-found error, path is "" here and MkdirAll
	// below runs on the empty path — confirm this is intended.
	path, err := getSubsystemPath(c, "memory")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}

	if err := os.MkdirAll(path, 0755); err != nil {
		return err
	}
	// do not try to enable the kernel memory if we already have
	// tasks in the cgroup.
	content, err := ioutil.ReadFile(filepath.Join(path, "tasks"))
	if err != nil {
		return err
	}
	if len(content) > 0 {
		return nil
	}
	return fs.EnableKernelMemoryAccounting(path)
}
+
+// isUnitExists returns true if the error is that a systemd unit already exists.
+func isUnitExists(err error) bool {
+       if err != nil {
+               if dbusError, ok := err.(dbus.Error); ok {
+                       return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
+               }
+       }
+       return false
+}
+
+func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) {
+       return m.Cgroups, nil
+}
diff --git a/libcontainer/cgroups/systemd/unified_hierarchy.go b/libcontainer/cgroups/systemd/unified_hierarchy.go
new file mode 100644 (file)
index 0000000..6605099
--- /dev/null
@@ -0,0 +1,312 @@
+// +build linux
+
+package systemd
+
+import (
+       "fmt"
+       "io/ioutil"
+       "math"
+       "os"
+       "path/filepath"
+       "strings"
+       "sync"
+       "time"
+
+       systemdDbus "github.com/coreos/go-systemd/dbus"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/pkg/errors"
+       "github.com/sirupsen/logrus"
+)
+
// UnifiedManager manages a container's cgroups via systemd on a
// cgroup v2 (unified) hierarchy.
type UnifiedManager struct {
	mu      sync.Mutex // guards Paths
	Cgroups *configs.Cgroup
	Paths   map[string]string // controller name -> cgroup path (all the same on v2)
}
+
// Apply creates (or joins) the systemd transient unit backing the
// container's cgroup, translating the configured resources into systemd
// unit properties, and records the resulting controller paths in
// m.Paths. pid == -1 creates the unit without placing a process in it.
func (m *UnifiedManager) Apply(pid int) error {
	var (
		c          = m.Cgroups
		unitName   = getUnitName(c)
		slice      = "system.slice"
		properties []systemdDbus.Property
	)

	// Pre-supplied paths: validate/record them and move pid over; no unit
	// is created in this mode.
	if c.Paths != nil {
		paths := make(map[string]string)
		for name, path := range c.Paths {
			_, err := getSubsystemPath(m.Cgroups, name)
			if err != nil {
				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
				if cgroups.IsNotFound(err) {
					continue
				}
				return err
			}
			paths[name] = path
		}
		m.Paths = paths
		return cgroups.EnterPid(m.Paths, pid)
	}

	if c.Parent != "" {
		slice = c.Parent
	}

	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))

	// if we create a slice, the parent is defined via a Wants=
	if strings.HasSuffix(unitName, ".slice") {
		properties = append(properties, systemdDbus.PropWants(slice))
	} else {
		// otherwise, we use Slice=
		properties = append(properties, systemdDbus.PropSlice(slice))
	}

	// only add pid if its valid, -1 is used w/ general slice creation.
	if pid != -1 {
		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
	}

	// Check if we can delegate. This is only supported on systemd versions 218 and above.
	if !strings.HasSuffix(unitName, ".slice") {
		// Assume scopes always support delegation.
		properties = append(properties, newProp("Delegate", true))
	}

	// Always enable accounting, this gets us the same behaviour as the fs implementation,
	// plus the kernel has some problems with joining the memory cgroup at a later time.
	properties = append(properties,
		newProp("MemoryAccounting", true),
		newProp("CPUAccounting", true),
		newProp("BlockIOAccounting", true))

	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
	properties = append(properties,
		newProp("DefaultDependencies", false))

	if c.Resources.Memory != 0 {
		properties = append(properties,
			newProp("MemoryLimit", uint64(c.Resources.Memory)))
	}

	if c.Resources.CpuShares != 0 {
		properties = append(properties,
			newProp("CPUShares", c.Resources.CpuShares))
	}

	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
		// corresponds to USEC_INFINITY in systemd
		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
		// always setting a property value ensures we can apply a quota and remove it later
		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
		if c.Resources.CpuQuota > 0 {
			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
			// (integer percentage of CPU) internally.  This means that if a fractional percent of
			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
			if cpuQuotaPerSecUSec%10000 != 0 {
				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
			}
		}
		properties = append(properties,
			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
	}

	if c.Resources.BlkioWeight != 0 {
		properties = append(properties,
			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
	}

	if c.Resources.PidsLimit > 0 {
		properties = append(properties,
			newProp("TasksAccounting", true),
			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
	}

	// We have to set kernel memory here, as we can't change it once
	// processes have been attached to the cgroup.
	if c.Resources.KernelMemory != 0 {
		if err := setKernelMemory(c); err != nil {
			return err
		}
	}

	// Start the transient unit. An already-existing unit is tolerated,
	// and we wait at most one second for the dbus completion signal.
	statusChan := make(chan string, 1)
	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
		select {
		case <-statusChan:
		case <-time.After(time.Second):
			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
		}
	} else if !isUnitExists(err) {
		return err
	}

	if err := joinCgroupsV2(c, pid); err != nil {
		return err
	}

	// On v2 every controller shares the same unified path.
	path, err := getSubsystemPath(m.Cgroups, "")
	if err != nil {
		return err
	}
	m.Paths = map[string]string{
		"pids":    path,
		"memory":  path,
		"io":      path,
		"cpu":     path,
		"devices": path,
		"cpuset":  path,
		"freezer": path,
	}
	return nil
}
+
+func (m *UnifiedManager) Destroy() error {
+       if m.Cgroups.Paths != nil {
+               return nil
+       }
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
+       if err := cgroups.RemovePaths(m.Paths); err != nil {
+               return err
+       }
+       m.Paths = make(map[string]string)
+       return nil
+}
+
+func (m *UnifiedManager) GetPaths() map[string]string {
+       m.mu.Lock()
+       paths := m.Paths
+       m.mu.Unlock()
+       return paths
+}
+func (m *UnifiedManager) GetUnifiedPath() (string, error) {
+       unifiedPath := ""
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       for k, v := range m.Paths {
+               if unifiedPath == "" {
+                       unifiedPath = v
+               } else if v != unifiedPath {
+                       return unifiedPath,
+                               errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v)
+               }
+       }
+       if unifiedPath == "" {
+               // FIXME: unified path could be detected even when no controller is available
+               return unifiedPath, errors.New("cannot detect unified path")
+       }
+       return unifiedPath, nil
+}
// createCgroupsv2Path creates the directory chain for a cgroup2 path
// below /sys/fs/cgroup, writing "+controller ..." into each ancestor's
// cgroup.subtree_control (every level except the leaf). Directories
// created here are removed again if a later step fails, via deferred
// cleanups keyed on the named return Err.
func createCgroupsv2Path(path string) (Err error) {
	content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
	if err != nil {
		return err
	}
	if !filepath.HasPrefix(path, "/sys/fs/cgroup") {
		return fmt.Errorf("invalid cgroup path %s", path)
	}

	// Build the "+ctrl1 +ctrl2 ..." enable string from the root's
	// available controllers.
	res := ""
	for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") {
		if i == 0 {
			res = fmt.Sprintf("+%s", c)
		} else {
			res = res + fmt.Sprintf(" +%s", c)
		}
	}
	resByte := []byte(res)

	current := "/sys/fs"
	elements := strings.Split(path, "/")
	for i, e := range elements[3:] {
		current = filepath.Join(current, e)
		if i > 0 {
			if err := os.Mkdir(current, 0755); err != nil {
				if !os.IsExist(err) {
					return err
				}
			} else {
				// If the directory was created, be sure it is not left around on errors.
				// BUG FIX: the deferred closure used to capture the loop
				// variable `current` by reference, so on failure every
				// deferred call removed only the deepest directory.
				// Rebind it per iteration so each cleanup removes the
				// directory it created.
				current := current
				defer func() {
					if Err != nil {
						os.Remove(current)
					}
				}()
			}
		}
		// Enable controllers for children on every level except the leaf.
		if i < len(elements[3:])-1 {
			if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil {
				return err
			}
		}
	}
	return nil
}
+
+func joinCgroupsV2(c *configs.Cgroup, pid int) error {
+       path, err := getSubsystemPath(c, "memory")
+       if err != nil {
+               return err
+       }
+       return createCgroupsv2Path(path)
+}
+
+func (m *UnifiedManager) fsManager() (cgroups.Manager, error) {
+       path, err := m.GetUnifiedPath()
+       if err != nil {
+               return nil, err
+       }
+       return fs2.NewManager(m.Cgroups, path, false)
+}
+
+func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
+       fsMgr, err := m.fsManager()
+       if err != nil {
+               return err
+       }
+       return fsMgr.Freeze(state)
+}
+
+func (m *UnifiedManager) GetPids() ([]int, error) {
+       path, err := m.GetUnifiedPath()
+       if err != nil {
+               return nil, err
+       }
+       return cgroups.GetPids(path)
+}
+
+func (m *UnifiedManager) GetAllPids() ([]int, error) {
+       path, err := m.GetUnifiedPath()
+       if err != nil {
+               return nil, err
+       }
+       return cgroups.GetAllPids(path)
+}
+
+func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
+       fsMgr, err := m.fsManager()
+       if err != nil {
+               return nil, err
+       }
+       return fsMgr.GetStats()
+}
+
+func (m *UnifiedManager) Set(container *configs.Config) error {
+       fsMgr, err := m.fsManager()
+       if err != nil {
+               return err
+       }
+       return fsMgr.Set(container)
+}
+
+func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) {
+       return m.Cgroups, nil
+}
diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go
new file mode 100644 (file)
index 0000000..dbcc58f
--- /dev/null
@@ -0,0 +1,588 @@
+// +build linux
+
+package cgroups
+
+import (
+       "bufio"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+       "sync"
+       "syscall"
+       "time"
+
+       units "github.com/docker/go-units"
+       "golang.org/x/sys/unix"
+)
+
const (
	// CgroupNamePrefix marks named (controller-less) v1 hierarchies,
	// e.g. "name=systemd".
	CgroupNamePrefix  = "name="
	// CgroupProcesses is the per-cgroup file listing member processes.
	CgroupProcesses   = "cgroup.procs"
	// unifiedMountpoint is where the cgroup2 hierarchy is mounted.
	unifiedMountpoint = "/sys/fs/cgroup"
)
+
var (
	// isUnifiedOnce guards the one-time cgroup v2 detection in
	// IsCgroup2UnifiedMode; isUnified caches its result.
	isUnifiedOnce sync.Once
	isUnified     bool
)

// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
+
+// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
+func IsCgroup2UnifiedMode() bool {
+       isUnifiedOnce.Do(func() {
+               var st syscall.Statfs_t
+               if err := syscall.Statfs(unifiedMountpoint, &st); err != nil {
+                       panic("cannot statfs cgroup root")
+               }
+               isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
+       })
+       return isUnified
+}
+
+// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
+func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
+       if IsCgroup2UnifiedMode() {
+               return unifiedMountpoint, nil
+       }
+       mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
+       return mnt, err
+}
+
// FindCgroupMountpointAndRoot returns the mountpoint and mount root for
// the given subsystem, restricted to mountpoints whose path begins with
// cgroupPath.
func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
	// We are not using mount.GetMounts() because it's super-inefficient,
	// parsing it directly sped up x10 times because of not using Sscanf.
	// It was one of two major performance drawbacks in container start.
	if !isSubsystemAvailable(subsystem) {
		return "", "", NewNotFoundError(subsystem)
	}

	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return "", "", err
	}
	defer f.Close()

	// On v2 there is one hierarchy: match the cgroup2 mount rather than a
	// named controller.
	if IsCgroup2UnifiedMode() {
		subsystem = ""
	}

	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
}
+
+func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
+       scanner := bufio.NewScanner(reader)
+       for scanner.Scan() {
+               txt := scanner.Text()
+               fields := strings.Fields(txt)
+               if len(fields) < 9 {
+                       continue
+               }
+               if strings.HasPrefix(fields[4], cgroupPath) {
+                       for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+                               if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem {
+                                       return fields[4], fields[3], nil
+                               }
+                       }
+               }
+       }
+       if err := scanner.Err(); err != nil {
+               return "", "", err
+       }
+
+       return "", "", NewNotFoundError(subsystem)
+}
+
// isSubsystemAvailable reports whether the given cgroup subsystem
// (controller) is available on this host.
func isSubsystemAvailable(subsystem string) bool {
	if IsCgroup2UnifiedMode() {
		// v2: check against the full controller list.
		controllers, err := GetAllSubsystems()
		if err != nil {
			return false
		}
		for _, c := range controllers {
			if c == subsystem {
				return true
			}
		}
		return false
	}

	// v1: the subsystem is available iff it appears in our own
	// /proc/self/cgroup.
	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
	if err != nil {
		return false
	}
	_, avail := cgroups[subsystem]
	return avail
}
+
// GetClosestMountpointAncestor returns the deepest mountpoint found in
// the given mountinfo text that is a path prefix of dir, or "" when none
// matches.
func GetClosestMountpointAncestor(dir, mountinfo string) string {
	deepest := ""
	for _, line := range strings.Split(mountinfo, "\n") {
		parts := strings.Fields(line)
		if len(parts) < 5 {
			continue
		}
		// Field 5 of a mountinfo line is the mountpoint.
		mp := parts[4]
		if strings.HasPrefix(mp, deepest) && strings.HasPrefix(dir, mp) {
			deepest = mp
		}
	}
	return deepest
}
+
// FindCgroupMountpointDir returns the parent directory of the first
// cgroup (v1 or v2) mountpoint listed in /proc/self/mountinfo.
func FindCgroupMountpointDir() (string, error) {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		text := scanner.Text()
		fields := strings.Split(text, " ")
		// Safe as mountinfo encodes mountpoints with spaces as \040.
		// NOTE(review): if a line lacks " - " then index is -1 and the
		// slice below silently starts at offset 2 — confirm mountinfo
		// lines always contain the separator.
		index := strings.Index(text, " - ")
		postSeparatorFields := strings.Fields(text[index+3:])
		numPostFields := len(postSeparatorFields)

		// This is an error as we can't detect if the mount is for "cgroup"
		if numPostFields == 0 {
			return "", fmt.Errorf("Found no fields post '-' in %q", text)
		}

		if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" {
			// Check that the mount is properly formatted.
			if numPostFields < 3 {
				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
			}

			return filepath.Dir(fields[4]), nil
		}
	}
	if err := scanner.Err(); err != nil {
		return "", err
	}

	return "", NewNotFoundError("cgroup")
}
+
// Mount describes a single cgroup mountpoint and the subsystems
// (controllers) attached to it.
type Mount struct {
	Mountpoint string   // where the hierarchy is mounted
	Root       string   // root of the mount within the hierarchy
	Subsystems []string // subsystem names attached to this mount
}
+
+func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
+       if len(m.Subsystems) == 0 {
+               return "", fmt.Errorf("no subsystem for mount")
+       }
+
+       return getControllerPath(m.Subsystems[0], cgroups)
+}
+
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+       res := make([]Mount, 0, len(ss))
+       scanner := bufio.NewScanner(mi)
+       numFound := 0
+       for scanner.Scan() && numFound < len(ss) {
+               txt := scanner.Text()
+               sepIdx := strings.Index(txt, " - ")
+               if sepIdx == -1 {
+                       return nil, fmt.Errorf("invalid mountinfo format")
+               }
+               if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
+                       continue
+               }
+               fields := strings.Split(txt, " ")
+               m := Mount{
+                       Mountpoint: fields[4],
+                       Root:       fields[3],
+               }
+               for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+                       seen, known := ss[opt]
+                       if !known || (!all && seen) {
+                               continue
+                       }
+                       ss[opt] = true
+                       if strings.HasPrefix(opt, CgroupNamePrefix) {
+                               opt = opt[len(CgroupNamePrefix):]
+                       }
+                       m.Subsystems = append(m.Subsystems, opt)
+                       numFound++
+               }
+               if len(m.Subsystems) > 0 || all {
+                       res = append(res, m)
+               }
+       }
+       if err := scanner.Err(); err != nil {
+               return nil, err
+       }
+       return res, nil
+}
+
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
func GetCgroupMounts(all bool) ([]Mount, error) {
	if IsCgroup2UnifiedMode() {
		// v2: a single synthetic entry covering every controller.
		availableControllers, err := GetAllSubsystems()
		if err != nil {
			return nil, err
		}
		m := Mount{
			Mountpoint: unifiedMountpoint,
			Root:       unifiedMountpoint,
			Subsystems: availableControllers,
		}
		return []Mount{m}, nil
	}

	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Use our own cgroup memberships as the set of subsystems to find.
	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
	if err != nil {
		return nil, err
	}

	allMap := make(map[string]bool)
	for s := range allSubsystems {
		allMap[s] = false
	}
	return getCgroupMountsHelper(allMap, f, all)
}
+
+// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
+func GetAllSubsystems() ([]string, error) {
+       // /proc/cgroups is meaningless for v2
+       // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
+       if IsCgroup2UnifiedMode() {
+               // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
+               // - devices: implemented in kernel 4.15
+               // - freezer: implemented in kernel 5.2
+               // We assume these are always available, as it is hard to detect availability.
+               pseudo := []string{"devices", "freezer"}
+               data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+               if err != nil {
+                       return nil, err
+               }
+               subsystems := append(pseudo, strings.Fields(string(data))...)
+               return subsystems, nil
+       }
+       f, err := os.Open("/proc/cgroups")
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       subsystems := []string{}
+
+       s := bufio.NewScanner(f)
+       for s.Scan() {
+               text := s.Text()
+               if text[0] != '#' {
+                       parts := strings.Fields(text)
+                       if len(parts) >= 4 && parts[3] != "0" {
+                               subsystems = append(subsystems, parts[0])
+                       }
+               }
+       }
+       if err := s.Err(); err != nil {
+               return nil, err
+       }
+       return subsystems, nil
+}
+
+// GetOwnCgroup returns the relative path to the cgroup docker is running in.
+func GetOwnCgroup(subsystem string) (string, error) {
+       cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+       if err != nil {
+               return "", err
+       }
+
+       return getControllerPath(subsystem, cgroups)
+}
+
+func GetOwnCgroupPath(subsystem string) (string, error) {
+       cgroup, err := GetOwnCgroup(subsystem)
+       if err != nil {
+               return "", err
+       }
+
+       return getCgroupPathHelper(subsystem, cgroup)
+}
+
+func GetInitCgroup(subsystem string) (string, error) {
+       cgroups, err := ParseCgroupFile("/proc/1/cgroup")
+       if err != nil {
+               return "", err
+       }
+
+       return getControllerPath(subsystem, cgroups)
+}
+
+func GetInitCgroupPath(subsystem string) (string, error) {
+       cgroup, err := GetInitCgroup(subsystem)
+       if err != nil {
+               return "", err
+       }
+
+       return getCgroupPathHelper(subsystem, cgroup)
+}
+
+func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
+       mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
+       if err != nil {
+               return "", err
+       }
+
+       // This is needed for nested containers, because in /proc/self/cgroup we
+       // see paths from host, which don't exist in container.
+       relCgroup, err := filepath.Rel(root, cgroup)
+       if err != nil {
+               return "", err
+       }
+
+       return filepath.Join(mnt, relCgroup), nil
+}
+
+func readProcsFile(dir string) ([]int, error) {
+       f, err := os.Open(filepath.Join(dir, CgroupProcesses))
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       var (
+               s   = bufio.NewScanner(f)
+               out = []int{}
+       )
+
+       for s.Scan() {
+               if t := s.Text(); t != "" {
+                       pid, err := strconv.Atoi(t)
+                       if err != nil {
+                               return nil, err
+                       }
+                       out = append(out, pid)
+               }
+       }
+       return out, nil
+}
+
+// ParseCgroupFile parses the given cgroup file, typically from
+// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
+func ParseCgroupFile(path string) (map[string]string, error) {
+       f, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       return parseCgroupFromReader(f)
+}
+
+// helper function for ParseCgroupFile to make testing easier
+func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
+       s := bufio.NewScanner(r)
+       cgroups := make(map[string]string)
+
+       for s.Scan() {
+               text := s.Text()
+               // from cgroups(7):
+               // /proc/[pid]/cgroup
+               // ...
+               // For each cgroup hierarchy ... there is one entry
+               // containing three colon-separated fields of the form:
+               //     hierarchy-ID:subsystem-list:cgroup-path
+               parts := strings.SplitN(text, ":", 3)
+               if len(parts) < 3 {
+                       return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
+               }
+
+               for _, subs := range strings.Split(parts[1], ",") {
+                       cgroups[subs] = parts[2]
+               }
+       }
+       if err := s.Err(); err != nil {
+               return nil, err
+       }
+
+       return cgroups, nil
+}
+
+func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
+       if IsCgroup2UnifiedMode() {
+               return "/", nil
+       }
+
+       if p, ok := cgroups[subsystem]; ok {
+               return p, nil
+       }
+
+       if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
+               return p, nil
+       }
+
+       return "", NewNotFoundError(subsystem)
+}
+
+func PathExists(path string) bool {
+       if _, err := os.Stat(path); err != nil {
+               return false
+       }
+       return true
+}
+
+func EnterPid(cgroupPaths map[string]string, pid int) error {
+       for _, path := range cgroupPaths {
+               if PathExists(path) {
+                       if err := WriteCgroupProc(path, pid); err != nil {
+                               return err
+                       }
+               }
+       }
+       return nil
+}
+
+// RemovePaths iterates over the provided paths removing them.
+// We trying to remove all paths five times with increasing delay between tries.
+// If after all there are not removed cgroups - appropriate error will be
+// returned.
+func RemovePaths(paths map[string]string) (err error) {
+       delay := 10 * time.Millisecond
+       for i := 0; i < 5; i++ {
+               if i != 0 {
+                       time.Sleep(delay)
+                       delay *= 2
+               }
+               for s, p := range paths {
+                       os.RemoveAll(p)
+                       // TODO: here probably should be logging
+                       _, err := os.Stat(p)
+                       // We need this strange way of checking cgroups existence because
+                       // RemoveAll almost always returns error, even on already removed
+                       // cgroups
+                       if os.IsNotExist(err) {
+                               delete(paths, s)
+                       }
+               }
+               if len(paths) == 0 {
+                       return nil
+               }
+       }
+       return fmt.Errorf("Failed to remove paths: %v", paths)
+}
+
+func GetHugePageSize() ([]string, error) {
+       files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+       if err != nil {
+               return []string{}, err
+       }
+       var fileNames []string
+       for _, st := range files {
+               fileNames = append(fileNames, st.Name())
+       }
+       return getHugePageSizeFromFilenames(fileNames)
+}
+
+func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
+       var pageSizes []string
+       for _, fileName := range fileNames {
+               nameArray := strings.Split(fileName, "-")
+               pageSize, err := units.RAMInBytes(nameArray[1])
+               if err != nil {
+                       return []string{}, err
+               }
+               sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
+               pageSizes = append(pageSizes, sizeString)
+       }
+
+       return pageSizes, nil
+}
+
// GetPids returns all pids, that were added to cgroup at path
// (direct members only; see GetAllPids for sub-cgroups as well).
func GetPids(path string) ([]int, error) {
	// Thin wrapper: the pids live in the cgroup's processes file.
	return readProcsFile(path)
}
+
+// GetAllPids returns all pids, that were added to cgroup at path and to all its
+// subcgroups.
+func GetAllPids(path string) ([]int, error) {
+       var pids []int
+       // collect pids from all sub-cgroups
+       err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
+               dir, file := filepath.Split(p)
+               if file != CgroupProcesses {
+                       return nil
+               }
+               if iErr != nil {
+                       return iErr
+               }
+               cPids, err := readProcsFile(dir)
+               if err != nil {
+                       return err
+               }
+               pids = append(pids, cPids...)
+               return nil
+       })
+       return pids, err
+}
+
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
	// Normally dir should not be empty, one case is that cgroup subsystem
	// is not mounted, we will get empty dir, and we want it fail here.
	if dir == "" {
		return fmt.Errorf("no such directory for %s", CgroupProcesses)
	}

	// Dont attach any pid to the cgroup if -1 is specified as a pid
	if pid == -1 {
		return nil
	}

	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
	if err != nil {
		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
	}
	defer cgroupProcessesFile.Close()

	// Retry the write a few times: it can transiently fail with EINVAL
	// while the target task is still being set up by the kernel.
	for i := 0; i < 5; i++ {
		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
		if err == nil {
			return nil
		}

		// EINVAL might mean that the task being added to cgroup.procs is in state
		// TASK_NEW. We should attempt to do so again.
		if isEINVAL(err) {
			time.Sleep(30 * time.Millisecond)
			continue
		}

		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
	}
	return err
}
+
+func isEINVAL(err error) bool {
+       switch err := err.(type) {
+       case *os.PathError:
+               return err.Err == unix.EINVAL
+       default:
+               return false
+       }
+}
diff --git a/libcontainer/cgroups/utils_test.go b/libcontainer/cgroups/utils_test.go
new file mode 100644 (file)
index 0000000..3214b9d
--- /dev/null
@@ -0,0 +1,459 @@
+// +build linux
+
+package cgroups
+
+import (
+       "bytes"
+       "errors"
+       "fmt"
+       "reflect"
+       "strings"
+       "testing"
+)
+
// fedoraMountinfo is a /proc/self/mountinfo capture from a Fedora host
// with cgroup v1 hierarchies mounted under /sys/fs/cgroup; used as a
// parsing fixture. Must be reproduced verbatim — tests depend on it.
const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755
18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw
20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755
23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755
24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children
27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children
28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children
29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children
30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children
31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children
32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children
33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children
34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children
35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered
36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel
38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel
39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel
40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw
42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw
43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw
45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered
46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered
47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered
48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered
121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000
124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw
165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered
167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered
171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered
175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered
179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered
183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered
187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered
191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered
195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered
199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered
203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered
207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered
211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered
215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered
219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered
223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered
227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered
231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered
235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered
239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered
243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered
247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered
31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1`
+
// systemdMountinfo is a mountinfo capture from inside a Docker container
// on a systemd host: cgroup mounts have non-"/" roots pointing at the
// container's scope. Fixture data — reproduced verbatim.
const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1
116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755
118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666
119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755
121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices
123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
124 120 0:22 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio
126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio
127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children
128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct
129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event
130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered
131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered
132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered
133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered
134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k
135 117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000
84 115 0:40 / /tmp rw,relatime - tmpfs none rw`
+
// bedrockMountinfo is a mountinfo capture from a Bedrock Linux host,
// where each cgroup hierarchy is bind-mounted into several strata in
// addition to /sys/fs/cgroup. Fixture data — reproduced verbatim.
const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event`
+
// cgroup2Mountinfo is a mountinfo capture from a hybrid host where the
// systemd hierarchy is a cgroup2 mount while the controllers are still
// v1 cgroup mounts. Fixture data — reproduced verbatim.
const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755
21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755
25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw
27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel
28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw
29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct
30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory
31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio
32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio
33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event
34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb
35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer
36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw
64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered
39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw
40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel
41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel
`
+
+func TestGetCgroupMounts(t *testing.T) {
+       type testData struct {
+               mountInfo  string
+               root       string
+               subsystems map[string]bool
+       }
+       testTable := []testData{
+               {
+                       mountInfo: fedoraMountinfo,
+                       root:      "/",
+                       subsystems: map[string]bool{
+                               "cpuset":     false,
+                               "cpu":        false,
+                               "cpuacct":    false,
+                               "memory":     false,
+                               "devices":    false,
+                               "freezer":    false,
+                               "net_cls":    false,
+                               "blkio":      false,
+                               "perf_event": false,
+                               "hugetlb":    false,
+                       },
+               },
+               {
+                       mountInfo: systemdMountinfo,
+                       root:      "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope",
+                       subsystems: map[string]bool{
+                               "cpuset":     false,
+                               "cpu":        false,
+                               "cpuacct":    false,
+                               "memory":     false,
+                               "devices":    false,
+                               "freezer":    false,
+                               "net_cls":    false,
+                               "blkio":      false,
+                               "perf_event": false,
+                       },
+               },
+               {
+                       mountInfo: bedrockMountinfo,
+                       root:      "/",
+                       subsystems: map[string]bool{
+                               "cpuset":     false,
+                               "cpu":        false,
+                               "cpuacct":    false,
+                               "memory":     false,
+                               "devices":    false,
+                               "freezer":    false,
+                               "net_cls":    false,
+                               "blkio":      false,
+                               "perf_event": false,
+                       },
+               },
+       }
+       for _, td := range testTable {
+               mi := bytes.NewBufferString(td.mountInfo)
+               cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               cgMap := make(map[string]Mount)
+               for _, m := range cgMounts {
+                       for _, ss := range m.Subsystems {
+                               cgMap[ss] = m
+                       }
+               }
+               for ss := range td.subsystems {
+                       m, ok := cgMap[ss]
+                       if !ok {
+                               t.Fatalf("%s not found", ss)
+                       }
+                       if m.Root != td.root {
+                               t.Fatalf("unexpected root for %s: %s", ss, m.Root)
+                       }
+                       if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) {
+                               t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint)
+                       }
+                       var ssFound bool
+                       for _, mss := range m.Subsystems {
+                               if mss == ss {
+                                       ssFound = true
+                                       break
+                               }
+                       }
+                       if !ssFound {
+                               t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems)
+                       }
+               }
+       }
+}
+
+func BenchmarkGetCgroupMounts(b *testing.B) {
+       subsystems := map[string]bool{
+               "cpuset":     false,
+               "cpu":        false,
+               "cpuacct":    false,
+               "memory":     false,
+               "devices":    false,
+               "freezer":    false,
+               "net_cls":    false,
+               "blkio":      false,
+               "perf_event": false,
+               "hugetlb":    false,
+       }
+       b.ResetTimer()
+       for i := 0; i < b.N; i++ {
+               b.StopTimer()
+               mi := bytes.NewBufferString(fedoraMountinfo)
+               b.StartTimer()
+               if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil {
+                       b.Fatal(err)
+               }
+       }
+}
+
+func TestParseCgroupString(t *testing.T) {
+       testCases := []struct {
+               input          string
+               expectedError  error
+               expectedOutput map[string]string
+       }{
+               {
+                       // Taken from a CoreOS instance running systemd 225 with CPU/Mem
+                       // accounting enabled in systemd
+                       input: `9:blkio:/
+8:freezer:/
+7:perf_event:/
+6:devices:/system.slice/system-sshd.slice
+5:cpuset:/
+4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+3:net_cls,net_prio:/
+2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`,
+                       expectedOutput: map[string]string{
+                               "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+                               "blkio":        "/",
+                               "freezer":      "/",
+                               "perf_event":   "/",
+                               "devices":      "/system.slice/system-sshd.slice",
+                               "cpuset":       "/",
+                               "cpu":          "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+                               "cpuacct":      "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+                               "net_cls":      "/",
+                               "net_prio":     "/",
+                               "memory":       "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+                       },
+               },
+               {
+                       input:         `malformed input`,
+                       expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`),
+               },
+       }
+
+       for ndx, testCase := range testCases {
+               out, err := parseCgroupFromReader(strings.NewReader(testCase.input))
+               if err != nil {
+                       if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() {
+                               t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err)
+                       }
+               } else {
+                       if !reflect.DeepEqual(testCase.expectedOutput, out) {
+                               t.Errorf("%v: expected output %v, got error %v", ndx, testCase.expectedOutput, out)
+                       }
+               }
+       }
+
+}
+
+func TestIgnoreCgroup2Mount(t *testing.T) {
+       subsystems := map[string]bool{
+               "cpuset":       false,
+               "cpu":          false,
+               "cpuacct":      false,
+               "memory":       false,
+               "devices":      false,
+               "freezer":      false,
+               "net_cls":      false,
+               "blkio":        false,
+               "perf_event":   false,
+               "pids":         false,
+               "name=systemd": false,
+       }
+
+       mi := bytes.NewBufferString(cgroup2Mountinfo)
+       cgMounts, err := getCgroupMountsHelper(subsystems, mi, false)
+       if err != nil {
+               t.Fatal(err)
+       }
+       for _, m := range cgMounts {
+               if m.Mountpoint == "/sys/fs/cgroup/systemd" {
+                       t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it")
+               }
+       }
+}
+
+func TestGetClosestMountpointAncestor(t *testing.T) {
+       fakeMountInfo := ` 18 24 0:17 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
+100 99 1:31 / /foo/bar rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz2 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/bazza rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz3 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo rw,relatime - fake fake rw,fake
+100 99 1:31 / /unrelated rw,relatime - fake fake rw,fake
+100 99 1:31 / / rw,relatime - fake fake rw,fake
+`
+       testCases := []struct {
+               input  string
+               output string
+       }{
+               {input: "/foo/bar/baz/a/b/c", output: "/foo/bar/baz"},
+               {input: "/foo/bar/baz", output: "/foo/bar/baz"},
+               {input: "/foo/bar/bazza", output: "/foo/bar/bazza"},
+               {input: "/a/b/c/d", output: "/"},
+       }
+
+       for _, c := range testCases {
+               mountpoint := GetClosestMountpointAncestor(c.input, fakeMountInfo)
+               if mountpoint != c.output {
+                       t.Errorf("expected %s, got %s", c.output, mountpoint)
+               }
+       }
+}
+
+func TestFindCgroupMountpointAndRoot(t *testing.T) {
+       fakeMountInfo := `
+35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+`
+       testCases := []struct {
+               cgroupPath string
+               output     string
+       }{
+               {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"},
+               {cgroupPath: "", output: "/foo"},
+       }
+
+       for _, c := range testCases {
+               mountpoint, _, _ := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices")
+               if mountpoint != c.output {
+                       t.Errorf("expected %s, got %s", c.output, mountpoint)
+               }
+       }
+}
+
+func TestGetHugePageSizeImpl(t *testing.T) {
+
+       testCases := []struct {
+               inputFiles      []string
+               outputPageSizes []string
+               err             error
+       }{
+               {
+                       inputFiles:      []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"},
+                       outputPageSizes: []string{"1GB", "2MB", "32MB", "64KB"},
+                       err:             nil,
+               },
+               {
+                       inputFiles:      []string{},
+                       outputPageSizes: []string{},
+                       err:             nil,
+               },
+               {
+                       inputFiles:      []string{"hugepages-a"},
+                       outputPageSizes: []string{},
+                       err:             errors.New("invalid size: 'a'"),
+               },
+       }
+
+       for _, c := range testCases {
+               pageSizes, err := getHugePageSizeFromFilenames(c.inputFiles)
+               if len(pageSizes) != 0 && len(c.outputPageSizes) != 0 && !reflect.DeepEqual(pageSizes, c.outputPageSizes) {
+                       t.Errorf("expected %s, got %s", c.outputPageSizes, pageSizes)
+               }
+               if err != nil && err.Error() != c.err.Error() {
+                       t.Errorf("expected error %s, got %s", c.err, err)
+               }
+       }
+}
diff --git a/libcontainer/configs/blkio_device.go b/libcontainer/configs/blkio_device.go
new file mode 100644 (file)
index 0000000..fa195bf
--- /dev/null
@@ -0,0 +1,66 @@
+package configs
+
+import "fmt"
+
// blockIODevice holds the major:minor device pair used by the blkio cgroup.
type blockIODevice struct {
	// Major is the device's major number
	Major int64 `json:"major"`
	// Minor is the device's minor number
	Minor int64 `json:"minor"`
}

// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
	blockIODevice
	// Weight is the bandwidth rate for the device, range is from 10 to 1000
	Weight uint16 `json:"weight"`
	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
	LeafWeight uint16 `json:"leafWeight"`
}

// NewWeightDevice returns a configured WeightDevice pointer
func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
	return &WeightDevice{
		blockIODevice: blockIODevice{Major: major, Minor: minor},
		Weight:        weight,
		LeafWeight:    leafWeight,
	}
}

// WeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) WeightString() string {
	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
}

// LeafWeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) LeafWeightString() string {
	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
}

// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
	blockIODevice
	// Rate is the IO rate limit per cgroup per device
	Rate uint64 `json:"rate"`
}

// NewThrottleDevice returns a configured ThrottleDevice pointer
func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
	return &ThrottleDevice{
		blockIODevice: blockIODevice{Major: major, Minor: minor},
		Rate:          rate,
	}
}

// String formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) String() string {
	return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}

// StringName formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) StringName(name string) string {
	return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
}
diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go
new file mode 100644 (file)
index 0000000..58ed19c
--- /dev/null
@@ -0,0 +1,130 @@
+package configs
+
// FreezerState is the requested state of the cgroup freezer subsystem
// for a container: "", "FROZEN" or "THAWED".
type FreezerState string

const (
	// Undefined means no freezer state has been requested.
	Undefined FreezerState = ""
	// Frozen requests that the container's tasks be suspended.
	Frozen FreezerState = "FROZEN"
	// Thawed requests that the container's tasks be resumed.
	Thawed FreezerState = "THAWED"
)
+
// Cgroup holds the cgroup placement (name/path) and, via the embedded
// Resources, the resource limits applied to a container.
type Cgroup struct {
	// Name is the name of the cgroup.
	// Deprecated, use Path instead
	Name string `json:"name,omitempty"`

	// name of parent of cgroup or slice
	// Deprecated, use Path instead
	Parent string `json:"parent,omitempty"`

	// Path specifies the path to cgroups that are created and/or joined by the container.
	// The path is assumed to be relative to the host system cgroup mountpoint.
	Path string `json:"path"`

	// ScopePrefix describes prefix for the scope name
	ScopePrefix string `json:"scope_prefix"`

	// Paths represent the absolute cgroups paths to join.
	// This takes precedence over Path.
	Paths map[string]string

	// Resources contains various cgroups settings to apply
	*Resources
}
+
// Resources holds the per-subsystem cgroup settings (devices, memory,
// cpu, blkio, pids, hugetlb, net_cls/net_prio, freezer) applied to a
// container's cgroup. Fields at the end apply to cgroups v2.
type Resources struct {
	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
	// Deprecated
	AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
	// Deprecated
	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
	// Deprecated
	DeniedDevices []*Device `json:"denied_devices,omitempty"`

	// Devices is the device access list enforced by the devices cgroup.
	Devices []*Device `json:"devices"`

	// Memory limit (in bytes)
	Memory int64 `json:"memory"`

	// Memory reservation or soft_limit (in bytes)
	MemoryReservation int64 `json:"memory_reservation"`

	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
	MemorySwap int64 `json:"memory_swap"`

	// Kernel memory limit (in bytes)
	KernelMemory int64 `json:"kernel_memory"`

	// Kernel memory limit for TCP use (in bytes)
	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`

	// CPU shares (relative weight vs. other containers)
	CpuShares uint64 `json:"cpu_shares"`

	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
	CpuQuota int64 `json:"cpu_quota"`

	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
	CpuPeriod uint64 `json:"cpu_period"`

	// How much time the CPU may use in realtime scheduling (in usecs).
	CpuRtRuntime int64 `json:"cpu_rt_quota"`

	// CPU period to be used for realtime scheduling (in usecs).
	CpuRtPeriod uint64 `json:"cpu_rt_period"`

	// CPU to use
	CpusetCpus string `json:"cpuset_cpus"`

	// MEM to use
	CpusetMems string `json:"cpuset_mems"`

	// Process limit; set <= `0' to disable limit.
	PidsLimit int64 `json:"pids_limit"`

	// Specifies per cgroup weight, range is from 10 to 1000.
	BlkioWeight uint16 `json:"blkio_weight"`

	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`

	// Weight per cgroup per device, can override BlkioWeight.
	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`

	// IO read rate limit per cgroup per device, bytes per second.
	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`

	// IO write rate limit per cgroup per device, bytes per second.
	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`

	// IO read rate limit per cgroup per device, IO per second.
	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`

	// IO write rate limit per cgroup per device, IO per second.
	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`

	// set the freeze value for the process
	Freezer FreezerState `json:"freezer"`

	// Hugetlb limit (in bytes)
	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`

	// Whether to disable OOM Killer
	OomKillDisable bool `json:"oom_kill_disable"`

	// Tuning swappiness behaviour per cgroup
	MemorySwappiness *uint64 `json:"memory_swappiness"`

	// Set priority of network traffic for container
	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`

	// Set class identifier for container's network packets
	NetClsClassid uint32 `json:"net_cls_classid_u"`

	// Used on cgroups v2:

	// CpuWeight sets a proportional bandwidth limit.
	CpuWeight uint64 `json:"cpu_weight"`

	// CpuMax sets the maximum bandwidth limit (format: max period).
	CpuMax string `json:"cpu_max"`
}
diff --git a/libcontainer/configs/cgroup_unsupported.go b/libcontainer/configs/cgroup_unsupported.go
new file mode 100644 (file)
index 0000000..c0c23d7
--- /dev/null
@@ -0,0 +1,8 @@
+// +build !linux
+
+package configs
+
// TODO Windows: This can ultimately be entirely factored out on Windows as
// cgroups are a Unix-specific construct.

// Cgroup is an empty placeholder on non-Linux platforms, which have no
// cgroup support.
type Cgroup struct {
}
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
new file mode 100644 (file)
index 0000000..24989e9
--- /dev/null
@@ -0,0 +1,354 @@
+package configs
+
+import (
+       "bytes"
+       "encoding/json"
+       "fmt"
+       "os/exec"
+       "time"
+
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "github.com/sirupsen/logrus"
+)
+
// Rlimit describes a single resource limit (as used by setrlimit(2)):
// the resource type plus its hard and soft values.
type Rlimit struct {
	// Type is the numeric resource identifier (RLIMIT_* constant).
	Type int `json:"type"`
	// Hard is the ceiling for the soft limit.
	Hard uint64 `json:"hard"`
	// Soft is the value enforced by the kernel.
	Soft uint64 `json:"soft"`
}

// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
	// ContainerID is the first ID of the range inside the container.
	ContainerID int `json:"container_id"`
	// HostID is the first corresponding ID of the range on the host.
	HostID int `json:"host_id"`
	// Size is the number of consecutive IDs the mapping covers.
	Size int `json:"size"`
}
+
// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
	// DefaultAction is applied to syscalls matched by no rule.
	DefaultAction Action `json:"default_action"`
	// Architectures lists additional architectures whose syscalls are allowed.
	Architectures []string `json:"architectures"`
	// Syscalls are the per-syscall rules.
	Syscalls []*Syscall `json:"syscalls"`
}

// Action is taken upon rule match in Seccomp
type Action int

// Supported seccomp actions. Values start at 1 so the zero value is
// distinguishable from a configured action.
const (
	Kill Action = iota + 1
	Errno
	Trap
	Allow
	Trace
	Log
)

// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int

// Supported argument comparison operators; values start at 1.
const (
	EqualTo Operator = iota + 1
	NotEqualTo
	GreaterThan
	GreaterThanOrEqualTo
	LessThan
	LessThanOrEqualTo
	MaskEqualTo
)

// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
	// Index is the position of the syscall argument (0-based).
	Index uint `json:"index"`
	// Value is compared against the argument using Op.
	Value uint64 `json:"value"`
	// ValueTwo is the second operand for operators that need one.
	ValueTwo uint64 `json:"value_two"`
	// Op selects how Value (and ValueTwo) are compared to the argument.
	Op Operator `json:"op"`
}

// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
	// Name is the syscall name.
	Name string `json:"name"`
	// Action is applied when the rule matches.
	Action Action `json:"action"`
	// Args optionally restricts the match to specific argument values.
	Args []*Arg `json:"args"`
}
+
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.

// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
	// This is a common option when the container is running in ramdisk
	NoPivotRoot bool `json:"no_pivot_root"`

	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
	// that the parent process dies.
	ParentDeathSignal int `json:"parent_death_signal"`

	// Rootfs is the path to a directory containing the container's root filesystem.
	Rootfs string `json:"rootfs"`

	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
	// bind mounts are writable.
	Readonlyfs bool `json:"readonlyfs"`

	// RootPropagation specifies the mount propagation flags to be applied to /.
	RootPropagation int `json:"rootPropagation"`

	// Mounts specify additional source and destination paths that will be mounted inside the container's
	// rootfs and mount namespace if specified
	Mounts []*Mount `json:"mounts"`

	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
	Devices []*Device `json:"devices"`

	// MountLabel is the SELinux label applied to the container's mounts.
	MountLabel string `json:"mount_label"`

	// Hostname optionally sets the container's hostname if provided
	Hostname string `json:"hostname"`

	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
	// If a namespace is not provided that namespace is shared from the container's parent process
	Namespaces Namespaces `json:"namespaces"`

	// Capabilities specify the capabilities to keep when executing the process inside the container
	// All capabilities not specified will be dropped from the processes capability mask
	Capabilities *Capabilities `json:"capabilities"`

	// Networks specifies the container's network setup to be created
	Networks []*Network `json:"networks"`

	// Routes can be specified to create entries in the route table as the container is started
	Routes []*Route `json:"routes"`

	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
	// placed into to limit the resources the container has available
	Cgroups *Cgroup `json:"cgroups"`

	// AppArmorProfile specifies the profile to apply to the process running in the container and is
	// changed at the time the process is execed
	AppArmorProfile string `json:"apparmor_profile,omitempty"`

	// ProcessLabel specifies the label to apply to the process running in the container.  It is
	// commonly used by selinux
	ProcessLabel string `json:"process_label,omitempty"`

	// Rlimits specifies the resource limits, such as max open files, to set in the container
	// If Rlimits are not set, the container will inherit rlimits from the parent process
	Rlimits []Rlimit `json:"rlimits,omitempty"`

	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
	// for a process. Valid values range from -1000 to 1000, where processes with
	// higher scores are preferred for being killed. If it is unset then we don't touch the current
	// value.
	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
	OomScoreAdj *int `json:"oom_score_adj,omitempty"`

	// UidMappings is an array of User ID mappings for User Namespaces
	UidMappings []IDMap `json:"uid_mappings"`

	// GidMappings is an array of Group ID mappings for User Namespaces
	GidMappings []IDMap `json:"gid_mappings"`

	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
	// mount pointing to /dev/null as to prevent reads of the file.
	MaskPaths []string `json:"mask_paths"`

	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
	// so that these files prevent any writes.
	ReadonlyPaths []string `json:"readonly_paths"`

	// Sysctl is a map of properties and their values. It is the equivalent of using
	// sysctl -w my.property.name value in Linux.
	Sysctl map[string]string `json:"sysctl"`

	// Seccomp allows actions to be taken whenever a syscall is made within the container.
	// A number of rules are given, each having an action to be taken if a syscall matches it.
	// A default action to be taken if no rules match is also given.
	Seccomp *Seccomp `json:"seccomp"`

	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`

	// Hooks are a collection of actions to perform at various container lifecycle events.
	// CommandHooks are serialized to JSON, but other hooks are not.
	Hooks *Hooks

	// Version is the version of opencontainer specification that is supported.
	Version string `json:"version"`

	// Labels are user defined metadata that is stored in the config and populated on the state
	Labels []string `json:"labels"`

	// NoNewKeyring will not allocate a new session keyring for the container.  It will use the
	// callers keyring in this case.
	NoNewKeyring bool `json:"no_new_keyring"`

	// IntelRdt specifies settings for Intel RDT group that the container is placed into
	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`

	// RootlessEUID is set when the runc was launched with non-zero EUID.
	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
	// When RootlessEUID is set, runc creates a new userns for the container.
	// (config.json needs to contain userns settings)
	RootlessEUID bool `json:"rootless_euid,omitempty"`

	// RootlessCgroups is set when unlikely to have the full access to cgroups.
	// When RootlessCgroups is set, cgroups errors are ignored.
	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
+
// Hooks groups the lifecycle hook lists executed around a container's
// start and stop; see the field comments for the exact point each list
// runs at.
type Hooks struct {
	// Prestart commands are executed after the container namespaces are created,
	// but before the user supplied command is executed from init.
	Prestart []Hook

	// Poststart commands are executed after the container init process starts.
	Poststart []Hook

	// Poststop commands are executed after the container init process exits.
	Poststop []Hook
}
+
// Capabilities holds the Linux capability sets applied to the container's
// process; see capabilities(7) for the semantics of each set.
type Capabilities struct {
	// Bounding is the bounding set, which limits the capabilities the
	// process can ever acquire. (The original comment duplicated the
	// Effective description; this follows capabilities(7).)
	Bounding []string
	// Effective is the set of capabilities checked by the kernel.
	Effective []string
	// Inheritable is the capabilities preserved across execve.
	Inheritable []string
	// Permitted is the limiting superset for effective capabilities.
	Permitted []string
	// Ambient is the ambient set of capabilities that are kept.
	Ambient []string
}
+
+func (hooks *Hooks) UnmarshalJSON(b []byte) error {
+       var state struct {
+               Prestart  []CommandHook
+               Poststart []CommandHook
+               Poststop  []CommandHook
+       }
+
+       if err := json.Unmarshal(b, &state); err != nil {
+               return err
+       }
+
+       deserialize := func(shooks []CommandHook) (hooks []Hook) {
+               for _, shook := range shooks {
+                       hooks = append(hooks, shook)
+               }
+
+               return hooks
+       }
+
+       hooks.Prestart = deserialize(state.Prestart)
+       hooks.Poststart = deserialize(state.Poststart)
+       hooks.Poststop = deserialize(state.Poststop)
+       return nil
+}
+
+func (hooks Hooks) MarshalJSON() ([]byte, error) {
+       serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
+               for _, hook := range hooks {
+                       switch chook := hook.(type) {
+                       case CommandHook:
+                               serializableHooks = append(serializableHooks, chook)
+                       default:
+                               logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
+                       }
+               }
+
+               return serializableHooks
+       }
+
+       return json.Marshal(map[string]interface{}{
+               "prestart":  serialize(hooks.Prestart),
+               "poststart": serialize(hooks.Poststart),
+               "poststop":  serialize(hooks.Poststop),
+       })
+}
+
// Hook is a single container lifecycle action, run with the container's
// OCI runtime state.
type Hook interface {
	// Run executes the hook with the provided state.
	Run(*specs.State) error
}
+
+// NewFunctionHook will call the provided function when the hook is run.
+func NewFunctionHook(f func(*specs.State) error) FuncHook {
+       return FuncHook{
+               run: f,
+       }
+}
+
+type FuncHook struct {
+       run func(*specs.State) error
+}
+
+func (f FuncHook) Run(s *specs.State) error {
+       return f.run(s)
+}
+
// Command describes an external program to run as a hook: the binary,
// its argv/environment, working directory and an optional timeout.
type Command struct {
	// Path is the absolute path of the binary to execute.
	Path string `json:"path"`
	// Args is the full argv, including the program name.
	Args []string `json:"args"`
	// Env is the environment in "KEY=value" form.
	Env []string `json:"env"`
	// Dir is the working directory for the command.
	Dir string `json:"dir"`
	// Timeout, if non-nil, bounds how long Run waits for the command.
	Timeout *time.Duration `json:"timeout"`
}
+
+// NewCommandHook will execute the provided command when the hook is run.
+func NewCommandHook(cmd Command) CommandHook {
+       return CommandHook{
+               Command: cmd,
+       }
+}
+
+type CommandHook struct {
+       Command
+}
+
// Run executes the configured command, feeding the container state as
// JSON on the command's stdin. If a Timeout is set and expires before
// the command exits, the process is killed and an error is returned;
// otherwise the command's exit status (annotated with its stdout and
// stderr) is returned.
func (c Command) Run(s *specs.State) error {
	b, err := json.Marshal(s)
	if err != nil {
		return err
	}
	var stdout, stderr bytes.Buffer
	cmd := exec.Cmd{
		Path:   c.Path,
		Args:   c.Args,
		Env:    c.Env,
		Stdin:  bytes.NewReader(b),
		Stdout: &stdout,
		Stderr: &stderr,
	}
	if err := cmd.Start(); err != nil {
		return err
	}
	// Wait in a goroutine so the select below can race completion against
	// the optional timeout. The buffer of 1 lets the goroutine finish even
	// if the timeout branch wins and nobody reads from errC.
	errC := make(chan error, 1)
	go func() {
		err := cmd.Wait()
		if err != nil {
			err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
		}
		errC <- err
	}()
	// With no Timeout configured, timerCh stays nil and its select case
	// blocks forever, so we only ever return via errC.
	var timerCh <-chan time.Time
	if c.Timeout != nil {
		timer := time.NewTimer(*c.Timeout)
		defer timer.Stop()
		timerCh = timer.C
	}
	select {
	case err := <-errC:
		return err
	case <-timerCh:
		cmd.Process.Kill()
		// NOTE(review): this Wait races the goroutine's Wait above; it is
		// here to reap the killed child, and its error is deliberately
		// ignored in favor of the timeout error below.
		cmd.Wait()
		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
	}
}
diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go
new file mode 100644 (file)
index 0000000..07da108
--- /dev/null
@@ -0,0 +1,61 @@
+package configs
+
+import "fmt"
+
+// HostUID gets the translated uid for the process on host which could be
+// different when user namespaces are enabled.
+func (c Config) HostUID(containerId int) (int, error) {
+       if c.Namespaces.Contains(NEWUSER) {
+               if c.UidMappings == nil {
+                       return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
+               }
+               id, found := c.hostIDFromMapping(containerId, c.UidMappings)
+               if !found {
+                       return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
+               }
+               return id, nil
+       }
+       // Return unchanged id.
+       return containerId, nil
+}
+
// HostRootUID gets the root uid for the process on host which could be
// non-zero when user namespaces are enabled. It is shorthand for
// HostUID(0).
func (c Config) HostRootUID() (int, error) {
	return c.HostUID(0)
}
+
+// HostGID gets the translated gid for the process on host which could be
+// different when user namespaces are enabled.
+func (c Config) HostGID(containerId int) (int, error) {
+       if c.Namespaces.Contains(NEWUSER) {
+               if c.GidMappings == nil {
+                       return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
+               }
+               id, found := c.hostIDFromMapping(containerId, c.GidMappings)
+               if !found {
+                       return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
+               }
+               return id, nil
+       }
+       // Return unchanged id.
+       return containerId, nil
+}
+
// HostRootGID gets the root gid for the process on host which could be
// non-zero when user namespaces are enabled. It is shorthand for
// HostGID(0).
func (c Config) HostRootGID() (int, error) {
	return c.HostGID(0)
}
+
+// Utility function that gets a host ID for a container ID from user namespace map
+// if that ID is present in the map.
+func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
+       for _, m := range uMap {
+               if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
+                       hostID := m.HostID + (containerID - m.ContainerID)
+                       return hostID, true
+               }
+       }
+       return -1, false
+}
diff --git a/libcontainer/configs/config_linux_test.go b/libcontainer/configs/config_linux_test.go
new file mode 100644 (file)
index 0000000..9c5f0fe
--- /dev/null
@@ -0,0 +1,130 @@
+package configs
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "path/filepath"
+       "testing"
+)
+
+func loadConfig(name string) (*Config, error) {
+       f, err := os.Open(filepath.Join("../sample_configs", name))
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       var container *Config
+       if err := json.NewDecoder(f).Decode(&container); err != nil {
+               return nil, err
+       }
+
+       // Check that a config doesn't contain extra fields
+       var configMap, abstractMap map[string]interface{}
+
+       if _, err := f.Seek(0, 0); err != nil {
+               return nil, err
+       }
+
+       if err := json.NewDecoder(f).Decode(&abstractMap); err != nil {
+               return nil, err
+       }
+
+       configData, err := json.Marshal(&container)
+       if err != nil {
+               return nil, err
+       }
+
+       if err := json.Unmarshal(configData, &configMap); err != nil {
+               return nil, err
+       }
+
+       for k := range configMap {
+               delete(abstractMap, k)
+       }
+
+       if len(abstractMap) != 0 {
+               return nil, fmt.Errorf("unknown fields: %s", abstractMap)
+       }
+
+       return container, nil
+}
+
+func TestRemoveNamespace(t *testing.T) {
+       ns := Namespaces{
+               {Type: NEWNET},
+       }
+       if !ns.Remove(NEWNET) {
+               t.Fatal("NEWNET was not removed")
+       }
+       if len(ns) != 0 {
+               t.Fatalf("namespaces should have 0 items but reports %d", len(ns))
+       }
+}
+
+func TestHostRootUIDNoUSERNS(t *testing.T) {
+       config := &Config{
+               Namespaces: Namespaces{},
+       }
+       uid, err := config.HostRootUID()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if uid != 0 {
+               t.Fatalf("expected uid 0 with no USERNS but received %d", uid)
+       }
+}
+
+func TestHostRootUIDWithUSERNS(t *testing.T) {
+       config := &Config{
+               Namespaces: Namespaces{{Type: NEWUSER}},
+               UidMappings: []IDMap{
+                       {
+                               ContainerID: 0,
+                               HostID:      1000,
+                               Size:        1,
+                       },
+               },
+       }
+       uid, err := config.HostRootUID()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if uid != 1000 {
+               t.Fatalf("expected uid 1000 with no USERNS but received %d", uid)
+       }
+}
+
+func TestHostRootGIDNoUSERNS(t *testing.T) {
+       config := &Config{
+               Namespaces: Namespaces{},
+       }
+       uid, err := config.HostRootGID()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if uid != 0 {
+               t.Fatalf("expected gid 0 with no USERNS but received %d", uid)
+       }
+}
+
+func TestHostRootGIDWithUSERNS(t *testing.T) {
+       config := &Config{
+               Namespaces: Namespaces{{Type: NEWUSER}},
+               GidMappings: []IDMap{
+                       {
+                               ContainerID: 0,
+                               HostID:      1000,
+                               Size:        1,
+                       },
+               },
+       }
+       uid, err := config.HostRootGID()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if uid != 1000 {
+               t.Fatalf("expected gid 1000 with no USERNS but received %d", uid)
+       }
+}
diff --git a/libcontainer/configs/config_test.go b/libcontainer/configs/config_test.go
new file mode 100644 (file)
index 0000000..c89a764
--- /dev/null
@@ -0,0 +1,195 @@
+package configs_test
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "reflect"
+       "testing"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestUnmarshalHooks(t *testing.T) {
+       timeout := time.Second
+
+       prestartCmd := configs.NewCommandHook(configs.Command{
+               Path:    "/var/vcap/hooks/prestart",
+               Args:    []string{"--pid=123"},
+               Env:     []string{"FOO=BAR"},
+               Dir:     "/var/vcap",
+               Timeout: &timeout,
+       })
+       prestart, err := json.Marshal(prestartCmd.Command)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       hook := configs.Hooks{}
+       err = hook.UnmarshalJSON([]byte(fmt.Sprintf(`{"Prestart" :[%s]}`, prestart)))
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       if !reflect.DeepEqual(hook.Prestart[0], prestartCmd) {
+               t.Errorf("Expected prestart to equal %+v but it was %+v",
+                       prestartCmd, hook.Prestart[0])
+       }
+}
+
+func TestUnmarshalHooksWithInvalidData(t *testing.T) {
+       hook := configs.Hooks{}
+       err := hook.UnmarshalJSON([]byte(`{invalid-json}`))
+       if err == nil {
+               t.Error("Expected error to occur but it was nil")
+       }
+}
+
+func TestMarshalHooks(t *testing.T) {
+       timeout := time.Second
+
+       prestartCmd := configs.NewCommandHook(configs.Command{
+               Path:    "/var/vcap/hooks/prestart",
+               Args:    []string{"--pid=123"},
+               Env:     []string{"FOO=BAR"},
+               Dir:     "/var/vcap",
+               Timeout: &timeout,
+       })
+
+       hook := configs.Hooks{
+               Prestart: []configs.Hook{prestartCmd},
+       }
+       hooks, err := hook.MarshalJSON()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       h := `{"poststart":null,"poststop":null,"prestart":[{"path":"/var/vcap/hooks/prestart","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]}`
+       if string(hooks) != h {
+               t.Errorf("Expected hooks %s to equal %s", string(hooks), h)
+       }
+}
+
+func TestMarshalUnmarshalHooks(t *testing.T) {
+       timeout := time.Second
+
+       prestart := configs.NewCommandHook(configs.Command{
+               Path:    "/var/vcap/hooks/prestart",
+               Args:    []string{"--pid=123"},
+               Env:     []string{"FOO=BAR"},
+               Dir:     "/var/vcap",
+               Timeout: &timeout,
+       })
+
+       hook := configs.Hooks{
+               Prestart: []configs.Hook{prestart},
+       }
+       hooks, err := hook.MarshalJSON()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       umMhook := configs.Hooks{}
+       err = umMhook.UnmarshalJSON(hooks)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if !reflect.DeepEqual(umMhook.Prestart[0], prestart) {
+               t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", umMhook.Prestart[0], prestart)
+       }
+}
+
+func TestMarshalHooksWithUnexpectedType(t *testing.T) {
+       fHook := configs.NewFunctionHook(func(*specs.State) error {
+               return nil
+       })
+       hook := configs.Hooks{
+               Prestart: []configs.Hook{fHook},
+       }
+       hooks, err := hook.MarshalJSON()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       h := `{"poststart":null,"poststop":null,"prestart":null}`
+       if string(hooks) != h {
+               t.Errorf("Expected hooks %s to equal %s", string(hooks), h)
+       }
+}
+
+func TestFuncHookRun(t *testing.T) {
+       state := &specs.State{
+               Version: "1",
+               ID:      "1",
+               Status:  "created",
+               Pid:     1,
+               Bundle:  "/bundle",
+       }
+
+       fHook := configs.NewFunctionHook(func(s *specs.State) error {
+               if !reflect.DeepEqual(state, s) {
+                       t.Errorf("Expected state %+v to equal %+v", state, s)
+               }
+               return nil
+       })
+
+       fHook.Run(state)
+}
+
+func TestCommandHookRun(t *testing.T) {
+       state := &specs.State{
+               Version: "1",
+               ID:      "1",
+               Status:  "created",
+               Pid:     1,
+               Bundle:  "/bundle",
+       }
+       timeout := time.Second
+
+       cmdHook := configs.NewCommandHook(configs.Command{
+               Path:    os.Args[0],
+               Args:    []string{os.Args[0], "-test.run=TestHelperProcess"},
+               Env:     []string{"FOO=BAR"},
+               Dir:     "/",
+               Timeout: &timeout,
+       })
+
+       err := cmdHook.Run(state)
+       if err != nil {
+               t.Errorf(fmt.Sprintf("Expected error to not occur but it was %+v", err))
+       }
+}
+
+func TestCommandHookRunTimeout(t *testing.T) {
+       state := &specs.State{
+               Version: "1",
+               ID:      "1",
+               Status:  "created",
+               Pid:     1,
+               Bundle:  "/bundle",
+       }
+       timeout := (10 * time.Millisecond)
+
+       cmdHook := configs.NewCommandHook(configs.Command{
+               Path:    os.Args[0],
+               Args:    []string{os.Args[0], "-test.run=TestHelperProcessWithTimeout"},
+               Env:     []string{"FOO=BAR"},
+               Dir:     "/",
+               Timeout: &timeout,
+       })
+
+       err := cmdHook.Run(state)
+       if err == nil {
+               t.Error("Expected error to occur but it was nil")
+       }
+}
+
// TestHelperProcess is not a real test: it is re-executed as a child
// process by TestCommandHookRun (via -test.run) and exits successfully
// right away.
func TestHelperProcess(*testing.T) {
	fmt.Println("Helper Process")
	os.Exit(0)
}
// TestHelperProcessWithTimeout is re-executed as a child process by
// TestCommandHookRunTimeout; it sleeps long enough to exceed that test's
// 10ms hook timeout.
func TestHelperProcessWithTimeout(*testing.T) {
	time.Sleep(time.Second)
}
diff --git a/libcontainer/configs/config_windows_test.go b/libcontainer/configs/config_windows_test.go
new file mode 100644 (file)
index 0000000..1a0c8fa
--- /dev/null
@@ -0,0 +1,3 @@
+package configs
+
+// All current tests are for Unix-specific functionality
diff --git a/libcontainer/configs/device.go b/libcontainer/configs/device.go
new file mode 100644 (file)
index 0000000..8701bb2
--- /dev/null
@@ -0,0 +1,57 @@
+package configs
+
+import (
+       "fmt"
+       "os"
+)
+
const (
	// Wildcard matches any device major or minor number in a cgroup rule.
	Wildcard = -1
)

// TODO Windows: This can be factored out in the future

// Device describes a device node and the cgroup permissions attached to it.
type Device struct {
	// Device type, block, char, etc.
	Type rune `json:"type"`

	// Path to the device.
	Path string `json:"path"`

	// Major is the device's major number.
	Major int64 `json:"major"`

	// Minor is the device's minor number.
	Minor int64 `json:"minor"`

	// Cgroup permissions format, rwm.
	Permissions string `json:"permissions"`

	// FileMode permission bits for the device.
	FileMode os.FileMode `json:"file_mode"`

	// Uid of the device.
	Uid uint32 `json:"uid"`

	// Gid of the device.
	Gid uint32 `json:"gid"`

	// Write the file to the allowed list
	Allow bool `json:"allow"`
}

// CgroupString renders the device as the "<type> <major>:<minor> <perms>"
// rule form consumed by the devices cgroup, with wildcards shown as "*".
func (d *Device) CgroupString() string {
	major, minor := deviceNumberString(d.Major), deviceNumberString(d.Minor)
	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
}

// Mkdev packs the major/minor pair into a single device number.
// NOTE(review): this encodes only the low bits (major<<8 plus the split
// minor); majors >= 4096 would lose bits — presumably fine for the device
// majors used here, but worth confirming.
func (d *Device) Mkdev() int {
	return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12))
}

// deviceNumberString converts the device number to a string return result.
func deviceNumberString(number int64) string {
	if number == Wildcard {
		return "*"
	}
	return fmt.Sprint(number)
}
diff --git a/libcontainer/configs/device_defaults.go b/libcontainer/configs/device_defaults.go
new file mode 100644 (file)
index 0000000..e4f423c
--- /dev/null
@@ -0,0 +1,111 @@
+// +build linux
+
+package configs
+
var (
	// DefaultSimpleDevices are devices that are to be both allowed and created.
	DefaultSimpleDevices = []*Device{
		// /dev/null and zero
		{
			Path:        "/dev/null",
			Type:        'c',
			Major:       1,
			Minor:       3,
			Permissions: "rwm",
			FileMode:    0666,
		},
		{
			Path:        "/dev/zero",
			Type:        'c',
			Major:       1,
			Minor:       5,
			Permissions: "rwm",
			FileMode:    0666,
		},

		{
			Path:        "/dev/full",
			Type:        'c',
			Major:       1,
			Minor:       7,
			Permissions: "rwm",
			FileMode:    0666,
		},

		// consoles and ttys
		{
			Path:        "/dev/tty",
			Type:        'c',
			Major:       5,
			Minor:       0,
			Permissions: "rwm",
			FileMode:    0666,
		},

		// /dev/urandom,/dev/random
		{
			Path:        "/dev/urandom",
			Type:        'c',
			Major:       1,
			Minor:       9,
			Permissions: "rwm",
			FileMode:    0666,
		},
		{
			Path:        "/dev/random",
			Type:        'c',
			Major:       1,
			Minor:       8,
			Permissions: "rwm",
			FileMode:    0666,
		},
	}
	// DefaultAllowedDevices extends DefaultSimpleDevices with devices that
	// are only allowed in the cgroup (not created), such as mknod rules,
	// the console, pts and tuntap entries. Entries without a Path are
	// cgroup-only rules.
	DefaultAllowedDevices = append([]*Device{
		// allow mknod for any device
		{
			Type:        'c',
			Major:       Wildcard,
			Minor:       Wildcard,
			Permissions: "m",
		},
		{
			Type:        'b',
			Major:       Wildcard,
			Minor:       Wildcard,
			Permissions: "m",
		},

		{
			Path:        "/dev/console",
			Type:        'c',
			Major:       5,
			Minor:       1,
			Permissions: "rwm",
		},
		// /dev/pts/ - pts namespaces are "coming soon"
		{
			Path:        "",
			Type:        'c',
			Major:       136,
			Minor:       Wildcard,
			Permissions: "rwm",
		},
		{
			Path:        "",
			Type:        'c',
			Major:       5,
			Minor:       2,
			Permissions: "rwm",
		},

		// tuntap
		{
			Path:        "",
			Type:        'c',
			Major:       10,
			Minor:       200,
			Permissions: "rwm",
		},
	}, DefaultSimpleDevices...)
	// DefaultAutoCreatedDevices is a copy of DefaultSimpleDevices: the
	// devices that are created inside the container by default.
	DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)
diff --git a/libcontainer/configs/hugepage_limit.go b/libcontainer/configs/hugepage_limit.go
new file mode 100644 (file)
index 0000000..d302163
--- /dev/null
@@ -0,0 +1,9 @@
+package configs
+
// HugepageLimit caps a container's usage of one hugepage size (hugetlb
// cgroup).
type HugepageLimit struct {
	// which type of hugepage to limit.
	Pagesize string `json:"page_size"`

	// usage limit for hugepage.
	Limit uint64 `json:"limit"`
}
diff --git a/libcontainer/configs/intelrdt.go b/libcontainer/configs/intelrdt.go
new file mode 100644 (file)
index 0000000..57e9f03
--- /dev/null
@@ -0,0 +1,13 @@
+package configs
+
// IntelRdt specifies Intel RDT (Resource Director Technology) constraints
// for a container: L3 cache allocation and memory bandwidth schemata.
type IntelRdt struct {
	// The schema for L3 cache id and capacity bitmask (CBM)
	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
	L3CacheSchema string `json:"l3_cache_schema,omitempty"`

	// The schema of memory bandwidth per L3 cache id
	// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
	// The unit of memory bandwidth is specified in "percentages" by
	// default, and in "MBps" if MBA Software Controller is enabled.
	// NOTE(review): this tag is camelCase while L3CacheSchema's is
	// snake_case — confirm before changing, since altering a json tag
	// breaks previously serialized configs.
	MemBwSchema string `json:"memBwSchema,omitempty"`
}
diff --git a/libcontainer/configs/interface_priority_map.go b/libcontainer/configs/interface_priority_map.go
new file mode 100644 (file)
index 0000000..9a0395e
--- /dev/null
@@ -0,0 +1,14 @@
+package configs
+
+import (
+       "fmt"
+)
+
// IfPrioMap maps a network interface name to a net_prio cgroup priority.
type IfPrioMap struct {
	Interface string `json:"interface"`
	Priority  int64  `json:"priority"`
}

// CgroupString renders the mapping as the "<interface> <priority>" line
// written to the net_prio cgroup.
func (i *IfPrioMap) CgroupString() string {
	return fmt.Sprint(i.Interface, " ", i.Priority)
}
diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go
new file mode 100644 (file)
index 0000000..670757d
--- /dev/null
@@ -0,0 +1,39 @@
+package configs
+
const (
	// EXT_COPYUP is a directive to copy up the contents of a directory when
	// a tmpfs is mounted over it. It is a runc-specific mount extension
	// flag (see Mount.Extensions).
	EXT_COPYUP = 1 << iota
)
+
// Mount describes a single filesystem mount to be performed for the
// container.
type Mount struct {
	// Source path for the mount.
	Source string `json:"source"`

	// Destination path for the mount inside the container.
	Destination string `json:"destination"`

	// Device the mount is for.
	Device string `json:"device"`

	// Mount flags.
	Flags int `json:"flags"`

	// Propagation Flags
	PropagationFlags []int `json:"propagation_flags"`

	// Mount data applied to the mount.
	Data string `json:"data"`

	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
	Relabel string `json:"relabel"`

	// Extensions are additional flags that are specific to runc
	// (e.g. EXT_COPYUP).
	Extensions int `json:"extensions"`

	// Optional Command to be run before Source is mounted.
	PremountCmds []Command `json:"premount_cmds"`

	// Optional Command to be run after Source is mounted.
	PostmountCmds []Command `json:"postmount_cmds"`
}
diff --git a/libcontainer/configs/namespaces.go b/libcontainer/configs/namespaces.go
new file mode 100644 (file)
index 0000000..a3329a3
--- /dev/null
@@ -0,0 +1,5 @@
+package configs
+
// NamespaceType labels a kind of Linux namespace (see the NEW* constants
// declared for Linux).
type NamespaceType string

// Namespaces is an ordered list of namespace configurations.
type Namespaces []Namespace
diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go
new file mode 100644 (file)
index 0000000..1bbaef9
--- /dev/null
@@ -0,0 +1,126 @@
+package configs
+
+import (
+       "fmt"
+       "os"
+       "sync"
+)
+
// Namespace types supported by libcontainer; the values match the names
// used in the serialized configuration.
const (
	NEWNET    NamespaceType = "NEWNET"
	NEWPID    NamespaceType = "NEWPID"
	NEWNS     NamespaceType = "NEWNS"
	NEWUTS    NamespaceType = "NEWUTS"
	NEWIPC    NamespaceType = "NEWIPC"
	NEWUSER   NamespaceType = "NEWUSER"
	NEWCGROUP NamespaceType = "NEWCGROUP"
)
+
var (
	// nsLock serializes access to the supportedNamespaces cache below.
	nsLock sync.Mutex
	// supportedNamespaces memoizes IsNamespaceSupported results.
	supportedNamespaces = make(map[NamespaceType]bool)
)
+
+// NsName converts the namespace type to its filename
+func NsName(ns NamespaceType) string {
+       switch ns {
+       case NEWNET:
+               return "net"
+       case NEWNS:
+               return "mnt"
+       case NEWPID:
+               return "pid"
+       case NEWIPC:
+               return "ipc"
+       case NEWUSER:
+               return "user"
+       case NEWUTS:
+               return "uts"
+       case NEWCGROUP:
+               return "cgroup"
+       }
+       return ""
+}
+
+// IsNamespaceSupported returns whether a namespace is available or
+// not
+func IsNamespaceSupported(ns NamespaceType) bool {
+       nsLock.Lock()
+       defer nsLock.Unlock()
+       supported, ok := supportedNamespaces[ns]
+       if ok {
+               return supported
+       }
+       nsFile := NsName(ns)
+       // if the namespace type is unknown, just return false
+       if nsFile == "" {
+               return false
+       }
+       _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
+       // a namespace is supported if it exists and we have permissions to read it
+       supported = err == nil
+       supportedNamespaces[ns] = supported
+       return supported
+}
+
// NamespaceTypes returns all namespace types known to libcontainer, in the
// order they should be handled. The order is significant: the user
// namespace must come first.
func NamespaceTypes() []NamespaceType {
	return []NamespaceType{
		NEWUSER, // Keep user NS always first, don't move it.
		NEWIPC,
		NEWUTS,
		NEWNET,
		NEWPID,
		NEWNS,
		NEWCGROUP,
	}
}
+
// Namespace defines configuration for each namespace.  It specifies an
// alternate path that is able to be joined via setns; an empty Path means
// a fresh namespace of that Type.
type Namespace struct {
	Type NamespaceType `json:"type"`
	Path string        `json:"path"`
}
+
// GetPath returns the /proc/<pid>/ns/<name> path for this namespace type
// and the given process id.
func (n *Namespace) GetPath(pid int) string {
	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
+
+func (n *Namespaces) Remove(t NamespaceType) bool {
+       i := n.index(t)
+       if i == -1 {
+               return false
+       }
+       *n = append((*n)[:i], (*n)[i+1:]...)
+       return true
+}
+
+func (n *Namespaces) Add(t NamespaceType, path string) {
+       i := n.index(t)
+       if i == -1 {
+               *n = append(*n, Namespace{Type: t, Path: path})
+               return
+       }
+       (*n)[i].Path = path
+}
+
+func (n *Namespaces) index(t NamespaceType) int {
+       for i, ns := range *n {
+               if ns.Type == t {
+                       return i
+               }
+       }
+       return -1
+}
+
// Contains reports whether a namespace of type t is present in the set.
func (n *Namespaces) Contains(t NamespaceType) bool {
	return n.index(t) != -1
}
+
+func (n *Namespaces) PathOf(t NamespaceType) string {
+       i := n.index(t)
+       if i == -1 {
+               return ""
+       }
+       return (*n)[i].Path
+}
diff --git a/libcontainer/configs/namespaces_syscall.go b/libcontainer/configs/namespaces_syscall.go
new file mode 100644 (file)
index 0000000..2dc7adf
--- /dev/null
@@ -0,0 +1,32 @@
+// +build linux
+
+package configs
+
+import "golang.org/x/sys/unix"
+
// Syscall returns the CLONE_* flag associated with this namespace type
// (zero for unknown types, since map lookups of missing keys yield 0).
func (n *Namespace) Syscall() int {
	return namespaceInfo[n.Type]
}
+
// namespaceInfo maps each namespace type to its clone(2)/unshare(2) flag.
var namespaceInfo = map[NamespaceType]int{
	NEWNET:    unix.CLONE_NEWNET,
	NEWNS:     unix.CLONE_NEWNS,
	NEWUSER:   unix.CLONE_NEWUSER,
	NEWIPC:    unix.CLONE_NEWIPC,
	NEWUTS:    unix.CLONE_NEWUTS,
	NEWPID:    unix.CLONE_NEWPID,
	NEWCGROUP: unix.CLONE_NEWCGROUP,
}
+
+// CloneFlags parses the container's Namespaces options to set the correct
+// flags on clone, unshare. This function returns flags only for new namespaces.
+func (n *Namespaces) CloneFlags() uintptr {
+       var flag int
+       for _, v := range *n {
+               if v.Path != "" {
+                       continue
+               }
+               flag |= namespaceInfo[v.Type]
+       }
+       return uintptr(flag)
+}
diff --git a/libcontainer/configs/namespaces_syscall_unsupported.go b/libcontainer/configs/namespaces_syscall_unsupported.go
new file mode 100644 (file)
index 0000000..5d9a5c8
--- /dev/null
@@ -0,0 +1,13 @@
+// +build !linux,!windows
+
+package configs
+
// Syscall panics: namespace clone flags only exist on Linux.
func (n *Namespace) Syscall() int {
	panic("No namespace syscall support")
}
+
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
// On non-Linux platforms it always panics.
func (n *Namespaces) CloneFlags() uintptr {
	panic("No namespace syscall support")
}
diff --git a/libcontainer/configs/namespaces_unsupported.go b/libcontainer/configs/namespaces_unsupported.go
new file mode 100644 (file)
index 0000000..19bf713
--- /dev/null
@@ -0,0 +1,8 @@
+// +build !linux
+
+package configs
+
// Namespace defines configuration for each namespace.  It specifies an
// alternate path that is able to be joined via setns. On non-Linux
// platforms no fields are needed.
type Namespace struct {
}
diff --git a/libcontainer/configs/network.go b/libcontainer/configs/network.go
new file mode 100644 (file)
index 0000000..ccdb228
--- /dev/null
@@ -0,0 +1,72 @@
+package configs
+
// Network defines configuration for a container's networking stack
//
// The network configuration can be omitted from a container causing the
// container to be setup with the host's networking stack
type Network struct {
	// Type sets the networks type, commonly veth and loopback
	Type string `json:"type"`

	// Name of the network interface
	Name string `json:"name"`

	// The bridge to use.
	Bridge string `json:"bridge"`

	// MacAddress contains the MAC address to set on the network interface
	MacAddress string `json:"mac_address"`

	// Address contains the IPv4 and mask to set on the network interface
	Address string `json:"address"`

	// Gateway sets the gateway address that is used as the default for the interface
	Gateway string `json:"gateway"`

	// IPv6Address contains the IPv6 and mask to set on the network interface
	IPv6Address string `json:"ipv6_address"`

	// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
	IPv6Gateway string `json:"ipv6_gateway"`

	// Mtu sets the mtu value for the interface and will be mirrored on both the host and
	// container's interfaces if a pair is created, specifically in the case of type veth
	// Note: This does not apply to loopback interfaces.
	Mtu int `json:"mtu"`

	// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
	// container's interfaces if a pair is created, specifically in the case of type veth
	// Note: This does not apply to loopback interfaces.
	TxQueueLen int `json:"txqueuelen"`

	// HostInterfaceName is a unique name for the host-side end of the veth
	// pair belonging to this container.
	HostInterfaceName string `json:"host_interface_name"`

	// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
	// bridge port in the case of type veth
	// Note: This is unsupported on some systems.
	// Note: This does not apply to loopback interfaces.
	HairpinMode bool `json:"hairpin_mode"`
}
+
// Route describes an entry to create in the route table as the container
// is started.
//
// All of destination, source, and gateway should be either IPv4 or IPv6.
// One of the three options must be present, and omitted entries will use their
// IP family default for the route table.  For IPv4 for example, setting the
// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
// destination of 0.0.0.0(or *) when viewed in the route table.
type Route struct {
	// Sets the destination and mask, should be a CIDR.  Accepts IPv4 and IPv6
	Destination string `json:"destination"`

	// Sets the source and mask, should be a CIDR.  Accepts IPv4 and IPv6
	Source string `json:"source"`

	// Sets the gateway.  Accepts IPv4 and IPv6
	Gateway string `json:"gateway"`

	// The device to set this route up for, for example: eth0
	InterfaceName string `json:"interface_name"`
}
diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go
new file mode 100644 (file)
index 0000000..393d9e8
--- /dev/null
@@ -0,0 +1,89 @@
+package validate
+
+import (
+       "fmt"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// rootlessEUID makes sure that the config can be applied when runc
+// is being executed as a non-root user (euid != 0) in the current user namespace.
+func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
+       if err := rootlessEUIDMappings(config); err != nil {
+               return err
+       }
+       if err := rootlessEUIDMount(config); err != nil {
+               return err
+       }
+
+       // XXX: We currently can't verify the user config at all, because
+       //      configs.Config doesn't store the user-related configs. So this
+       //      has to be verified by setupUser() in init_linux.go.
+
+       return nil
+}
+
+func hasIDMapping(id int, mappings []configs.IDMap) bool {
+       for _, m := range mappings {
+               if id >= m.ContainerID && id < m.ContainerID+m.Size {
+                       return true
+               }
+       }
+       return false
+}
+
+func rootlessEUIDMappings(config *configs.Config) error {
+       if !config.Namespaces.Contains(configs.NEWUSER) {
+               return fmt.Errorf("rootless container requires user namespaces")
+       }
+
+       if len(config.UidMappings) == 0 {
+               return fmt.Errorf("rootless containers requires at least one UID mapping")
+       }
+       if len(config.GidMappings) == 0 {
+               return fmt.Errorf("rootless containers requires at least one GID mapping")
+       }
+       return nil
+}
+
+// mount verifies that the user isn't trying to set up any mounts they don't have
+// the rights to do. In addition, it makes sure that no mount has a `uid=` or
+// `gid=` option that doesn't resolve to root.
+func rootlessEUIDMount(config *configs.Config) error {
+       // XXX: We could whitelist allowed devices at this point, but I'm not
+       //      convinced that's a good idea. The kernel is the best arbiter of
+       //      access control.
+
+       for _, mount := range config.Mounts {
+               // Check that the options list doesn't contain any uid= or gid= entries
+               // that don't resolve to root.
+               for _, opt := range strings.Split(mount.Data, ",") {
+                       if strings.HasPrefix(opt, "uid=") {
+                               var uid int
+                               n, err := fmt.Sscanf(opt, "uid=%d", &uid)
+                               if n != 1 || err != nil {
+                                       // Ignore unknown mount options.
+                                       continue
+                               }
+                               if !hasIDMapping(uid, config.UidMappings) {
+                                       return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers")
+                               }
+                       }
+
+                       if strings.HasPrefix(opt, "gid=") {
+                               var gid int
+                               n, err := fmt.Sscanf(opt, "gid=%d", &gid)
+                               if n != 1 || err != nil {
+                                       // Ignore unknown mount options.
+                                       continue
+                               }
+                               if !hasIDMapping(gid, config.GidMappings) {
+                                       return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers")
+                               }
+                       }
+               }
+       }
+
+       return nil
+}
diff --git a/libcontainer/configs/validate/rootless_test.go b/libcontainer/configs/validate/rootless_test.go
new file mode 100644 (file)
index 0000000..59d1557
--- /dev/null
@@ -0,0 +1,155 @@
+package validate
+
+import (
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// rootlessEUIDConfig returns a minimal valid rootless-container config:
+// a user namespace plus single-entry uid/gid mappings that map container
+// root to unprivileged host IDs.
+func rootlessEUIDConfig() *configs.Config {
+	return &configs.Config{
+		Rootfs:          "/var",
+		RootlessEUID:    true,
+		RootlessCgroups: true,
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{Type: configs.NEWUSER},
+			},
+		),
+		UidMappings: []configs.IDMap{
+			{
+				HostID:      1337,
+				ContainerID: 0,
+				Size:        1,
+			},
+		},
+		GidMappings: []configs.IDMap{
+			{
+				HostID:      7331,
+				ContainerID: 0,
+				Size:        1,
+			},
+		},
+	}
+}
+
+// TestValidateRootlessEUID checks that the baseline rootless config passes.
+func TestValidateRootlessEUID(t *testing.T) {
+	validator := New()
+
+	config := rootlessEUIDConfig()
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur: %+v", err)
+	}
+}
+
+/* rootlessEUIDMappings */
+
+// TestValidateRootlessEUIDUserns checks that validation fails when the
+// user namespace is removed from an otherwise rootless config.
+func TestValidateRootlessEUIDUserns(t *testing.T) {
+	validator := New()
+
+	config := rootlessEUIDConfig()
+	config.Namespaces = nil
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur if user namespaces not set")
+	}
+}
+
+// TestValidateRootlessEUIDMappingUid checks that validation fails when no
+// uid mappings are provided for a rootless container.
+func TestValidateRootlessEUIDMappingUid(t *testing.T) {
+	validator := New()
+
+	config := rootlessEUIDConfig()
+	config.UidMappings = nil
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur if no uid mappings provided")
+	}
+}
+
+// TestValidateNonZeroEUIDMappingGid checks that validation fails when no
+// gid mappings are provided for a rootless container.
+func TestValidateNonZeroEUIDMappingGid(t *testing.T) {
+	validator := New()
+
+	config := rootlessEUIDConfig()
+	config.GidMappings = nil
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur if no gid mappings provided")
+	}
+}
+
+/* rootlessEUIDMount() */
+
+// TestValidateRootlessEUIDMountUid exercises uid= mount-option validation:
+// unmapped uids must be rejected, mapped uids (including ranges) accepted.
+func TestValidateRootlessEUIDMountUid(t *testing.T) {
+	config := rootlessEUIDConfig()
+	validator := New()
+
+	config.Mounts = []*configs.Mount{
+		{
+			Source:      "devpts",
+			Destination: "/dev/pts",
+			Device:      "devpts",
+		},
+	}
+
+	// No uid= option at all: nothing to validate.
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err)
+	}
+
+	// uid=5 is outside the single-entry mapping (container id 0, size 1).
+	config.Mounts[0].Data = "uid=5"
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur when setting uid=5 in mount options")
+	}
+
+	// uid=0 is the mapped container root.
+	config.Mounts[0].Data = "uid=0"
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err)
+	}
+
+	// Widening the mapping to size 10 makes uid=2 valid.
+	config.Mounts[0].Data = "uid=2"
+	config.UidMappings[0].Size = 10
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when setting uid=2 in mount options and UidMapping[0].size is 10")
+	}
+
+	// uid=20 is just past the end of the size-10 mapping.
+	config.Mounts[0].Data = "uid=20"
+	config.UidMappings[0].Size = 10
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur when setting uid=20 in mount options and UidMapping[0].size is 10")
+	}
+}
+
+// TestValidateRootlessEUIDMountGid is the gid= counterpart of the uid test
+// above.
+func TestValidateRootlessEUIDMountGid(t *testing.T) {
+	config := rootlessEUIDConfig()
+	validator := New()
+
+	config.Mounts = []*configs.Mount{
+		{
+			Source:      "devpts",
+			Destination: "/dev/pts",
+			Device:      "devpts",
+		},
+	}
+
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err)
+	}
+
+	config.Mounts[0].Data = "gid=5"
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur when setting gid=5 in mount options")
+	}
+
+	config.Mounts[0].Data = "gid=0"
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err)
+	}
+
+	config.Mounts[0].Data = "gid=5"
+	config.GidMappings[0].Size = 10
+	if err := validator.Validate(config); err != nil {
+		t.Errorf("Expected error to not occur when setting gid=5 in mount options and GidMapping[0].size is 10")
+	}
+
+	config.Mounts[0].Data = "gid=11"
+	config.GidMappings[0].Size = 10
+	if err := validator.Validate(config); err == nil {
+		t.Errorf("Expected error to occur when setting gid=11 in mount options and GidMapping[0].size is 10")
+	}
+}
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
new file mode 100644 (file)
index 0000000..3b42f30
--- /dev/null
@@ -0,0 +1,245 @@
+package validate
+
+import (
+       "fmt"
+       "os"
+       "path/filepath"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       selinux "github.com/opencontainers/selinux/go-selinux"
+)
+
+// Validator checks a container configuration for correctness.
+type Validator interface {
+	Validate(*configs.Config) error
+}
+
+// New returns a Validator performing the standard set of configuration checks.
+func New() Validator {
+	return &ConfigValidator{}
+}
+
+// ConfigValidator is the default implementation of Validator.
+type ConfigValidator struct {
+}
+
+func (v *ConfigValidator) Validate(config *configs.Config) error {
+       if err := v.rootfs(config); err != nil {
+               return err
+       }
+       if err := v.network(config); err != nil {
+               return err
+       }
+       if err := v.hostname(config); err != nil {
+               return err
+       }
+       if err := v.security(config); err != nil {
+               return err
+       }
+       if err := v.usernamespace(config); err != nil {
+               return err
+       }
+       if err := v.cgroupnamespace(config); err != nil {
+               return err
+       }
+       if err := v.sysctl(config); err != nil {
+               return err
+       }
+       if err := v.intelrdt(config); err != nil {
+               return err
+       }
+       if config.RootlessEUID {
+               if err := v.rootlessEUID(config); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+// rootfs validates that the rootfs exists and that its cleaned absolute
+// path resolves to itself — i.e. the configured path is neither relative
+// nor a symlink to somewhere else.
+func (v *ConfigValidator) rootfs(config *configs.Config) error {
+	if _, err := os.Stat(config.Rootfs); err != nil {
+		if os.IsNotExist(err) {
+			return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
+		}
+		return err
+	}
+	cleaned, err := filepath.Abs(config.Rootfs)
+	if err != nil {
+		return err
+	}
+	if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
+		return err
+	}
+	// If resolving symlinks changed the path, the original was not a plain
+	// absolute path.
+	if filepath.Clean(config.Rootfs) != cleaned {
+		return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
+	}
+	return nil
+}
+
+func (v *ConfigValidator) network(config *configs.Config) error {
+       if !config.Namespaces.Contains(configs.NEWNET) {
+               if len(config.Networks) > 0 || len(config.Routes) > 0 {
+                       return fmt.Errorf("unable to apply network settings without a private NET namespace")
+               }
+       }
+       return nil
+}
+
+func (v *ConfigValidator) hostname(config *configs.Config) error {
+       if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
+               return fmt.Errorf("unable to set hostname without a private UTS namespace")
+       }
+       return nil
+}
+
+func (v *ConfigValidator) security(config *configs.Config) error {
+       // restrict sys without mount namespace
+       if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
+               !config.Namespaces.Contains(configs.NEWNS) {
+               return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
+       }
+       if config.ProcessLabel != "" && !selinux.GetEnabled() {
+               return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
+       }
+
+       return nil
+}
+
+func (v *ConfigValidator) usernamespace(config *configs.Config) error {
+       if config.Namespaces.Contains(configs.NEWUSER) {
+               if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+                       return fmt.Errorf("USER namespaces aren't enabled in the kernel")
+               }
+       } else {
+               if config.UidMappings != nil || config.GidMappings != nil {
+                       return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config")
+               }
+       }
+       return nil
+}
+
+func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
+       if config.Namespaces.Contains(configs.NEWCGROUP) {
+               if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+                       return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
+               }
+       }
+       return nil
+}
+
+// sysctl validates that the specified sysctl keys are valid or not.
+// /proc/sys isn't completely namespaced and depending on which namespaces
+// are specified, a subset of sysctls are permitted.
+func (v *ConfigValidator) sysctl(config *configs.Config) error {
+	// IPC-namespaced sysctls that are safe to set inside a private IPC
+	// namespace.
+	validSysctlMap := map[string]bool{
+		"kernel.msgmax":          true,
+		"kernel.msgmnb":          true,
+		"kernel.msgmni":          true,
+		"kernel.sem":             true,
+		"kernel.shmall":          true,
+		"kernel.shmmax":          true,
+		"kernel.shmmni":          true,
+		"kernel.shm_rmid_forced": true,
+	}
+
+	for s := range config.Sysctl {
+		// IPC sysctls (including POSIX message queues, fs.mqueue.*) need a
+		// private IPC namespace.
+		if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
+			if config.Namespaces.Contains(configs.NEWIPC) {
+				continue
+			} else {
+				return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
+			}
+		}
+		// net.* sysctls need a private network namespace; if the namespace
+		// is joined via a path, make sure that path is not actually the
+		// host's namespace.
+		if strings.HasPrefix(s, "net.") {
+			if config.Namespaces.Contains(configs.NEWNET) {
+				if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
+					if err := checkHostNs(s, path); err != nil {
+						return err
+					}
+				}
+				continue
+			} else {
+				return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
+			}
+		}
+		if config.Namespaces.Contains(configs.NEWUTS) {
+			switch s {
+			case "kernel.domainname":
+				// This is namespaced and there's no explicit OCI field for it.
+				continue
+			case "kernel.hostname":
+				// This is namespaced but there's a conflicting (dedicated) OCI field for it.
+				return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
+			}
+		}
+		// Anything not matched above is not covered by a separate namespace
+		// and would leak to the host.
+		return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
+	}
+
+	return nil
+}
+
+func (v *ConfigValidator) intelrdt(config *configs.Config) error {
+       if config.IntelRdt != nil {
+               if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
+                       return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled")
+               }
+
+               if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" {
+                       return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
+               }
+               if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" {
+                       return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
+               }
+
+               if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" {
+                       return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
+               }
+               if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" {
+                       return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
+               }
+       }
+
+       return nil
+}
+
+func isSymbolicLink(path string) (bool, error) {
+       fi, err := os.Lstat(path)
+       if err != nil {
+               return false, err
+       }
+
+       return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
+}
+
+// checkHostNs checks whether a net.* sysctl is being applied while the
+// configured network-namespace path actually refers to the host's network
+// namespace, which is not allowed.
+func checkHostNs(sysctlConfig string, path string) error {
+	var currentProcessNetns = "/proc/self/ns/net"
+	// readlink on the current processes network namespace
+	destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
+	if err != nil {
+		// Include the underlying error instead of discarding it.
+		return fmt.Errorf("read soft link %q error: %v", currentProcessNetns, err)
+	}
+
+	// First check if the provided path is a symbolic link
+	symLink, err := isSymbolicLink(path)
+	if err != nil {
+		return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
+	}
+
+	if !symLink {
+		// The provided namespace is not a symbolic link,
+		// it is not the host namespace.
+		return nil
+	}
+
+	// readlink on the path provided in the struct
+	destOfContainer, err := os.Readlink(path)
+	if err != nil {
+		return fmt.Errorf("read soft link %q error: %v", path, err)
+	}
+	if destOfContainer == destOfCurrentProcess {
+		return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
+	}
+	return nil
+}
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
new file mode 100644 (file)
index 0000000..f6826fb
--- /dev/null
@@ -0,0 +1,267 @@
+package validate_test
+
+import (
+       "os"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/configs/validate"
+)
+
+// TestValidate checks that a minimal config with only a rootfs passes.
+func TestValidate(t *testing.T) {
+	config := &configs.Config{
+		Rootfs: "/var",
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err != nil {
+		t.Errorf("Expected error to not occur: %+v", err)
+	}
+}
+
+// TestValidateWithInvalidRootfs checks that a relative/symlinked rootfs
+// path is rejected.
+func TestValidateWithInvalidRootfs(t *testing.T) {
+	dir := "rootfs"
+	// Best-effort symlink; the relative path fails validation either way.
+	os.Symlink("/var", dir)
+	defer os.Remove(dir)
+
+	config := &configs.Config{
+		Rootfs: dir,
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateNetworkWithoutNETNamespace checks that network interfaces
+// are rejected without a private network namespace.
+func TestValidateNetworkWithoutNETNamespace(t *testing.T) {
+	network := &configs.Network{Type: "loopback"}
+	config := &configs.Config{
+		Rootfs:     "/var",
+		Namespaces: []configs.Namespace{},
+		Networks:   []*configs.Network{network},
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateNetworkRoutesWithoutNETNamespace checks that routes are
+// rejected without a private network namespace.
+func TestValidateNetworkRoutesWithoutNETNamespace(t *testing.T) {
+	route := &configs.Route{Gateway: "255.255.255.0"}
+	config := &configs.Config{
+		Rootfs:     "/var",
+		Namespaces: []configs.Namespace{},
+		Routes:     []*configs.Route{route},
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateHostname checks that a hostname with a UTS namespace passes.
+func TestValidateHostname(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:   "/var",
+		Hostname: "runc",
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{Type: configs.NEWUTS},
+			},
+		),
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err != nil {
+		t.Errorf("Expected error to not occur: %+v", err)
+	}
+}
+
+// TestValidateHostnameWithoutUTSNamespace checks that a hostname without a
+// UTS namespace is rejected.
+func TestValidateHostnameWithoutUTSNamespace(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:   "/var",
+		Hostname: "runc",
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateSecurityWithMaskPaths checks that mask paths with a mount
+// namespace pass validation.
+func TestValidateSecurityWithMaskPaths(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:    "/var",
+		MaskPaths: []string{"/proc/kcore"},
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{Type: configs.NEWNS},
+			},
+		),
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err != nil {
+		t.Errorf("Expected error to not occur: %+v", err)
+	}
+}
+
+// TestValidateSecurityWithROPaths checks that read-only paths with a mount
+// namespace pass validation.
+func TestValidateSecurityWithROPaths(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:        "/var",
+		ReadonlyPaths: []string{"/proc/sys"},
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{Type: configs.NEWNS},
+			},
+		),
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err != nil {
+		t.Errorf("Expected error to not occur: %+v", err)
+	}
+}
+
+// TestValidateSecurityWithoutNEWNS checks that path restrictions without a
+// mount namespace are rejected.
+func TestValidateSecurityWithoutNEWNS(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:        "/var",
+		MaskPaths:     []string{"/proc/kcore"},
+		ReadonlyPaths: []string{"/proc/sys"},
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateUsernamespace checks that requesting a user namespace passes
+// when the kernel supports it (skipped otherwise).
+func TestValidateUsernamespace(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+	config := &configs.Config{
+		Rootfs: "/var",
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{Type: configs.NEWUSER},
+			},
+		),
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err != nil {
+		t.Errorf("expected error to not occur %+v", err)
+	}
+}
+
+// TestValidateUsernamespaceWithoutUserNS checks that uid mappings without a
+// user namespace are rejected.
+func TestValidateUsernamespaceWithoutUserNS(t *testing.T) {
+	uidMap := configs.IDMap{ContainerID: 123}
+	config := &configs.Config{
+		Rootfs:      "/var",
+		UidMappings: []configs.IDMap{uidMap},
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateSysctl checks that namespaced sysctls are rejected when the
+// corresponding namespace is missing.
+func TestValidateSysctl(t *testing.T) {
+	sysctl := map[string]string{
+		"fs.mqueue.ctl": "ctl",
+		"net.ctl":       "ctl",
+		"kernel.ctl":    "ctl",
+	}
+
+	for k, v := range sysctl {
+		config := &configs.Config{
+			Rootfs: "/var",
+			Sysctl: map[string]string{k: v},
+		}
+
+		validator := validate.New()
+		err := validator.Validate(config)
+		if err == nil {
+			t.Error("Expected error to occur but it was nil")
+		}
+	}
+}
+
+// TestValidateValidSysctl checks that IPC- and network-namespaced sysctls
+// pass when the matching namespaces are configured.
+func TestValidateValidSysctl(t *testing.T) {
+	sysctl := map[string]string{
+		"fs.mqueue.ctl": "ctl",
+		"net.ctl":       "ctl",
+		"kernel.msgmax": "ctl",
+	}
+
+	for k, v := range sysctl {
+		config := &configs.Config{
+			Rootfs: "/var",
+			Sysctl: map[string]string{k: v},
+			Namespaces: []configs.Namespace{
+				{
+					Type: configs.NEWNET,
+				},
+				{
+					Type: configs.NEWIPC,
+				},
+			},
+		}
+
+		validator := validate.New()
+		err := validator.Validate(config)
+		if err != nil {
+			t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err)
+		}
+	}
+}
+
+// TestValidateSysctlWithSameNs checks that a net.* sysctl is rejected when
+// the network namespace path points at the host's own namespace.
+func TestValidateSysctlWithSameNs(t *testing.T) {
+	config := &configs.Config{
+		Rootfs: "/var",
+		Sysctl: map[string]string{"net.ctl": "ctl"},
+		Namespaces: configs.Namespaces(
+			[]configs.Namespace{
+				{
+					Type: configs.NEWNET,
+					Path: "/proc/self/ns/net",
+				},
+			},
+		),
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestValidateSysctlWithoutNETNamespace checks that a net.* sysctl is
+// rejected when no network namespace is configured at all.
+func TestValidateSysctlWithoutNETNamespace(t *testing.T) {
+	config := &configs.Config{
+		Rootfs:     "/var",
+		Sysctl:     map[string]string{"net.ctl": "ctl"},
+		Namespaces: []configs.Namespace{},
+	}
+
+	validator := validate.New()
+	err := validator.Validate(config)
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
diff --git a/libcontainer/console_linux.go b/libcontainer/console_linux.go
new file mode 100644 (file)
index 0000000..9997e93
--- /dev/null
@@ -0,0 +1,41 @@
+package libcontainer
+
+import (
+       "os"
+
+       "golang.org/x/sys/unix"
+)
+
+// mountConsole creates /dev/console (with a zero umask so the node gets
+// the intended mode) and bind-mounts the console slave path over it.
+func mountConsole(slavePath string) error {
+	oldMask := unix.Umask(0000)
+	defer unix.Umask(oldMask)
+	// Ensure the mount target exists; an already-existing file is fine.
+	f, err := os.Create("/dev/console")
+	if err != nil && !os.IsExist(err) {
+		return err
+	}
+	if f != nil {
+		f.Close()
+	}
+	return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
+}
+
+// dupStdio opens the slavePath for the console and dups the fds to the current
+// processes stdio, fd 0,1,2.
+func dupStdio(slavePath string) error {
+       fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
+       if err != nil {
+               return &os.PathError{
+                       Op:   "open",
+                       Path: slavePath,
+                       Err:  err,
+               }
+       }
+       for _, i := range []int{0, 1, 2} {
+               if err := unix.Dup3(fd, i, 0); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
diff --git a/libcontainer/container.go b/libcontainer/container.go
new file mode 100644 (file)
index 0000000..ba7541c
--- /dev/null
@@ -0,0 +1,173 @@
+// Package libcontainer provides a native Go implementation for creating containers
+// with namespaces, cgroups, capabilities, and filesystem access controls.
+// It allows you to manage the lifecycle of the container performing additional operations
+// after the container is created.
+package libcontainer
+
+import (
+       "os"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Status is the status of a container. The zero value is Created.
+type Status int
+
+const (
+	// Created is the status that denotes the container exists but has not been run yet.
+	Created Status = iota
+	// Running is the status that denotes the container exists and is running.
+	Running
+	// Pausing is the status that denotes the container exists, it is in the process of being paused.
+	Pausing
+	// Paused is the status that denotes the container exists, but all its processes are paused.
+	Paused
+	// Stopped is the status that denotes the container does not have a created or running process.
+	Stopped
+)
+
+func (s Status) String() string {
+       switch s {
+       case Created:
+               return "created"
+       case Running:
+               return "running"
+       case Pausing:
+               return "pausing"
+       case Paused:
+               return "paused"
+       case Stopped:
+               return "stopped"
+       default:
+               return "unknown"
+       }
+}
+
+// BaseState represents the platform agnostic pieces relating to a
+// running container's state.
+type BaseState struct {
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	// InitProcessPid is the init process id in the parent namespace.
+	InitProcessPid int `json:"init_process_pid"`
+
+	// InitProcessStartTime is the init process start time in clock cycles since boot time.
+	InitProcessStartTime uint64 `json:"init_process_start"`
+
+	// Created is the unix timestamp for the creation time of the container in UTC.
+	Created time.Time `json:"created"`
+
+	// Config is the container's configuration.
+	Config configs.Config `json:"config"`
+}
+
+// BaseContainer is a libcontainer container object.
+//
+// Each container is thread-safe within the same process. Since a container can
+// be destroyed by a separate process, any function may return that the container
+// was not found. BaseContainer includes methods that are platform agnostic.
+type BaseContainer interface {
+	// ID returns the ID of the container.
+	ID() string
+
+	// Status returns the current status of the container.
+	//
+	// errors:
+	// ContainerNotExists - Container no longer exists,
+	// Systemerror - System error.
+	Status() (Status, error)
+
+	// State returns the current container's state information.
+	//
+	// errors:
+	// SystemError - System error.
+	State() (*State, error)
+
+	// OCIState returns the current container's state information in the
+	// OCI runtime-spec format.
+	//
+	// errors:
+	// SystemError - System error.
+	OCIState() (*specs.State, error)
+
+	// Config returns the current config of the container.
+	Config() configs.Config
+
+	// Processes returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
+	//
+	// errors:
+	// ContainerNotExists - Container no longer exists,
+	// Systemerror - System error.
+	//
+	// Some of the returned PIDs may no longer refer to processes in the Container, unless
+	// the Container state is PAUSED in which case every PID in the slice is valid.
+	Processes() ([]int, error)
+
+	// Stats returns statistics for the container.
+	//
+	// errors:
+	// ContainerNotExists - Container no longer exists,
+	// Systemerror - System error.
+	Stats() (*Stats, error)
+
+	// Set resources of container as configured.
+	//
+	// We can use this to change resources when containers are running.
+	//
+	// errors:
+	// SystemError - System error.
+	Set(config configs.Config) error
+
+	// Start a process inside the container. Returns error if process fails to
+	// start. You can track process lifecycle with passed Process structure.
+	//
+	// errors:
+	// ContainerNotExists - Container no longer exists,
+	// ConfigInvalid - config is invalid,
+	// ContainerPaused - Container is paused,
+	// SystemError - System error.
+	Start(process *Process) (err error)
+
+	// Run immediately starts the process inside the container. Returns error if process
+	// fails to start. It does not block waiting for the exec fifo after start returns but
+	// opens the fifo after start returns.
+	//
+	// errors:
+	// ContainerNotExists - Container no longer exists,
+	// ConfigInvalid - config is invalid,
+	// ContainerPaused - Container is paused,
+	// SystemError - System error.
+	Run(process *Process) (err error)
+
+	// Destroy destroys the container, if it's in a valid state, after killing any
+	// remaining running processes.
+	//
+	// Any event registrations are removed before the container is destroyed.
+	// No error is returned if the container is already destroyed.
+	//
+	// Running containers must first be stopped using Signal(..).
+	// Paused containers must first be resumed using Resume(..).
+	//
+	// errors:
+	// ContainerNotStopped - Container is still running,
+	// ContainerPaused - Container is paused,
+	// SystemError - System error.
+	Destroy() error
+
+	// Signal sends the provided signal code to the container's initial process.
+	//
+	// If all is specified the signal is sent to all processes in the container
+	// including the initial process.
+	//
+	// errors:
+	// SystemError - System error.
+	Signal(s os.Signal, all bool) error
+
+	// Exec signals the container to exec the users process at the end of the init.
+	//
+	// errors:
+	// SystemError - System error.
+	Exec() error
+}
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
new file mode 100644 (file)
index 0000000..fe70c93
--- /dev/null
@@ -0,0 +1,2060 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "bytes"
+       "encoding/json"
+       "errors"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "reflect"
+       "strings"
+       "sync"
+       "syscall" // only for SysProcAttr and Signal
+       "time"
+
+       securejoin "github.com/cyphar/filepath-securejoin"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       criurpc "github.com/checkpoint-restore/go-criu/rpc"
+       "github.com/golang/protobuf/proto"
+       "github.com/sirupsen/logrus"
+       "github.com/vishvananda/netlink/nl"
+       "golang.org/x/sys/unix"
+)
+
+// stdioFdCount is the number of standard file descriptors (stdin,
+// stdout, stderr) that precede any ExtraFiles in a child process; it is
+// used to compute the fd numbers passed via _LIBCONTAINER_* env vars.
+const stdioFdCount = 3
+
+// linuxContainer is the Linux implementation of a libcontainer
+// container. Mutable state is guarded by m.
+type linuxContainer struct {
+       id                   string
+       root                 string
+       config               *configs.Config
+       cgroupManager        cgroups.Manager
+       intelRdtManager      intelrdt.Manager
+       initPath             string
+       initArgs             []string
+       initProcess          parentProcess
+       initProcessStartTime uint64
+       criuPath             string
+       newuidmapPath        string
+       newgidmapPath        string
+       m                    sync.Mutex
+       criuVersion          int
+       state                containerState
+       created              time.Time
+}
+
+// State represents a running container's state.
+type State struct {
+       // BaseState carries the platform-independent state fields.
+       BaseState
+
+       // Platform specific fields below here
+
+       // Specified if the container was started under the rootless mode.
+       // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
+       Rootless bool `json:"rootless"`
+
+       // Path to all the cgroups setup for a container. Key is cgroup subsystem name
+       // with the value as the path.
+       CgroupPaths map[string]string `json:"cgroup_paths"`
+
+       // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
+       // with the value as the path.
+       NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
+
+       // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
+       ExternalDescriptors []string `json:"external_descriptors,omitempty"`
+
+       // Intel RDT "resource control" filesystem path
+       IntelRdtPath string `json:"intel_rdt_path"`
+}
+
+// Container is a libcontainer container object.
+//
+// Each container is thread-safe within the same process. Since a container can
+// be destroyed by a separate process, any function may return that the container
+// was not found.
+type Container interface {
+       BaseContainer
+
+       // Methods below here are platform specific
+
+       // Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
+       //
+       // errors:
+       // SystemError - System error.
+       Checkpoint(criuOpts *CriuOpts) error
+
+       // Restore restores the checkpointed container to a running state using the criu(8) utility.
+       //
+       // errors:
+       // SystemError - System error.
+       Restore(process *Process, criuOpts *CriuOpts) error
+
+       // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
+       // the execution of any user processes. Asynchronously, when the container has finished being paused the
+       // state is changed to PAUSED.
+       // If the Container state is PAUSED, do nothing.
+       //
+       // errors:
+       // ContainerNotExists - Container no longer exists,
+       // ContainerNotRunning - Container not running or created,
+       // SystemError - System error.
+       Pause() error
+
+       // If the Container state is PAUSED, resumes the execution of any user processes in the
+       // Container before setting the Container state to RUNNING.
+       // If the Container state is RUNNING, do nothing.
+       //
+       // errors:
+       // ContainerNotExists - Container no longer exists,
+       // ContainerNotPaused - Container is not paused,
+       // SystemError - System error.
+       Resume() error
+
+       // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
+       //
+       // errors:
+       // SystemError - System error.
+       NotifyOOM() (<-chan struct{}, error)
+
+       // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
+       //
+       // errors:
+       // SystemError - System error.
+       NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
+}
+
+// ID returns the container's unique identifier.
+func (c *linuxContainer) ID() string {
+       return c.id
+}
+
+// Config returns a (shallow) copy of the container's configuration.
+func (c *linuxContainer) Config() configs.Config {
+       return *c.config
+}
+
+// Status returns the container's current status, holding the state
+// lock for the duration of the check.
+func (c *linuxContainer) Status() (Status, error) {
+       c.m.Lock()
+       defer c.m.Unlock()
+       return c.currentStatus()
+}
+
+// State returns a snapshot of the container's current state, holding
+// the state lock while it is built.
+func (c *linuxContainer) State() (*State, error) {
+       c.m.Lock()
+       defer c.m.Unlock()
+       return c.currentState()
+}
+
+// OCIState returns the container's state in OCI runtime-spec form.
+func (c *linuxContainer) OCIState() (*specs.State, error) {
+       c.m.Lock()
+       defer c.m.Unlock()
+       return c.currentOCIState()
+}
+
+// Processes returns the PIDs of all processes running inside the
+// container, as reported by the cgroup manager.
+func (c *linuxContainer) Processes() ([]int, error) {
+       pids, err := c.cgroupManager.GetAllPids()
+       if err != nil {
+               return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
+       }
+       return pids, nil
+}
+
+// Stats collects cgroup statistics, Intel RDT statistics (when an RDT
+// manager is configured) and per-interface network statistics for the
+// container's veth interfaces.
+func (c *linuxContainer) Stats() (*Stats, error) {
+       var (
+               err   error
+               stats = &Stats{}
+       )
+       if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
+               return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
+       }
+       if c.intelRdtManager != nil {
+               if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
+                       return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
+               }
+       }
+       for _, iface := range c.config.Networks {
+               switch iface.Type {
+               case "veth":
+                       // only veth interfaces have a host-side interface
+                       // whose counters we can read here
+                       istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
+                       if err != nil {
+                               return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
+                       }
+                       stats.Interfaces = append(stats.Interfaces, istats)
+               }
+       }
+       return stats, nil
+}
+
+// Set applies config's cgroup (and Intel RDT) resource settings to the
+// running container. On failure the previous settings are restored on a
+// best-effort basis; on success c.config is replaced and the saved
+// state file is updated.
+func (c *linuxContainer) Set(config configs.Config) error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       status, err := c.currentStatus()
+       if err != nil {
+               return err
+       }
+       if status == Stopped {
+               return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+       }
+       if err := c.cgroupManager.Set(&config); err != nil {
+               // Set configs back
+               if err2 := c.cgroupManager.Set(c.config); err2 != nil {
+                       logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+               }
+               return err
+       }
+       if c.intelRdtManager != nil {
+               if err := c.intelRdtManager.Set(&config); err != nil {
+                       // Set configs back
+                       if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
+                               logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+                       }
+                       return err
+               }
+       }
+       // After config setting succeed, update config and states
+       c.config = &config
+       _, err = c.updateState(nil)
+       return err
+}
+
+// Start starts a process inside the container. For an init process the
+// exec fifo is created first, and removed again if the start fails.
+func (c *linuxContainer) Start(process *Process) error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       if process.Init {
+               if err := c.createExecFifo(); err != nil {
+                       return err
+               }
+       }
+       if err := c.start(process); err != nil {
+               if process.Init {
+                       c.deleteExecFifo()
+               }
+               return err
+       }
+       return nil
+}
+
+// Run starts the process and, for an init process, immediately
+// unblocks it by draining the exec fifo (see exec). Locking is done by
+// Start; exec is called without re-taking c.m.
+func (c *linuxContainer) Run(process *Process) error {
+       if err := c.Start(process); err != nil {
+               return err
+       }
+       if process.Init {
+               return c.exec()
+       }
+       return nil
+}
+
+// Exec signals a created container to exec the user's process by
+// reading from the exec fifo, holding the state lock.
+func (c *linuxContainer) Exec() error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       return c.exec()
+}
+
+// exec unblocks the container's init process by opening and draining
+// the exec fifo. The blocking open runs in a goroutine; every 100ms we
+// also poll the init process so a dead init does not leave us blocked
+// on the fifo forever.
+func (c *linuxContainer) exec() error {
+       path := filepath.Join(c.root, execFifoFilename)
+       pid := c.initProcess.pid()
+       blockingFifoOpenCh := awaitFifoOpen(path)
+       for {
+               select {
+               case result := <-blockingFifoOpenCh:
+                       return handleFifoResult(result)
+
+               case <-time.After(time.Millisecond * 100):
+                       stat, err := system.Stat(pid)
+                       if err != nil || stat.State == system.Zombie {
+                               // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
+                               // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
+                               if err := handleFifoResult(fifoOpen(path, false)); err != nil {
+                                       return errors.New("container process is already dead")
+                               }
+                               return nil
+                       }
+               }
+       }
+}
+
+// readFromExecFifo drains the exec fifo. An empty read means the fifo
+// was already consumed, i.e. the container was already started.
+func readFromExecFifo(execFifo io.Reader) error {
+       data, err := ioutil.ReadAll(execFifo)
+       if err != nil {
+               return err
+       }
+       if len(data) <= 0 {
+               return fmt.Errorf("cannot start an already running container")
+       }
+       return nil
+}
+
+// awaitFifoOpen opens the fifo in blocking mode from a goroutine and
+// delivers the result on the returned channel.
+func awaitFifoOpen(path string) <-chan openResult {
+       fifoOpened := make(chan openResult)
+       go func() {
+               result := fifoOpen(path, true)
+               fifoOpened <- result
+       }()
+       return fifoOpened
+}
+
+// fifoOpen opens the exec fifo read-only, in non-blocking mode when
+// block is false.
+func fifoOpen(path string, block bool) openResult {
+       flags := os.O_RDONLY
+       if !block {
+               flags |= syscall.O_NONBLOCK
+       }
+       f, err := os.OpenFile(path, flags, 0)
+       if err != nil {
+               return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
+       }
+       return openResult{file: f}
+}
+
+// handleFifoResult drains the opened fifo and removes the fifo file on
+// success; any open error from result is returned as-is.
+func handleFifoResult(result openResult) error {
+       if result.err != nil {
+               return result.err
+       }
+       f := result.file
+       defer f.Close()
+       if err := readFromExecFifo(f); err != nil {
+               return err
+       }
+       return os.Remove(f.Name())
+}
+
+// openResult carries the outcome of a fifo open attempt: exactly one
+// of file or err is set.
+type openResult struct {
+       file *os.File
+       err  error
+}
+
+// start creates and starts the parent process for process. For an init
+// process it also transitions to the created state, persists it, and
+// runs any poststart hooks, terminating the process if a hook fails.
+func (c *linuxContainer) start(process *Process) error {
+       parent, err := c.newParentProcess(process)
+       if err != nil {
+               return newSystemErrorWithCause(err, "creating new parent process")
+       }
+       parent.forwardChildLogs()
+       if err := parent.start(); err != nil {
+               // terminate the process to ensure that it properly is reaped.
+               if err := ignoreTerminateErrors(parent.terminate()); err != nil {
+                       logrus.Warn(err)
+               }
+               return newSystemErrorWithCause(err, "starting container process")
+       }
+       // generate a timestamp indicating when the container was started
+       c.created = time.Now().UTC()
+       if process.Init {
+               c.state = &createdState{
+                       c: c,
+               }
+               state, err := c.updateState(parent)
+               if err != nil {
+                       return err
+               }
+               c.initProcessStartTime = state.InitProcessStartTime
+
+               if c.config.Hooks != nil {
+                       s, err := c.currentOCIState()
+                       if err != nil {
+                               return err
+                       }
+                       for i, hook := range c.config.Hooks.Poststart {
+                               if err := hook.Run(s); err != nil {
+                                       if err := ignoreTerminateErrors(parent.terminate()); err != nil {
+                                               logrus.Warn(err)
+                                       }
+                                       return newSystemErrorWithCausef(err, "running poststart hook %d", i)
+                               }
+                       }
+               }
+       }
+       return nil
+}
+
+// Signal sends s to the container's init process, or to every process
+// in the container's cgroup when all is true.
+//
+// NOTE(review): unlike most other methods, Signal does not take c.m
+// before reading status — confirm this is intentional.
+func (c *linuxContainer) Signal(s os.Signal, all bool) error {
+       if all {
+               return signalAllProcesses(c.cgroupManager, s)
+       }
+       status, err := c.currentStatus()
+       if err != nil {
+               return err
+       }
+       // only signal init while it is known to be alive,
+       // to avoid a PID reuse attack
+       if status == Running || status == Created || status == Paused {
+               if err := c.initProcess.signal(s); err != nil {
+                       return newSystemErrorWithCause(err, "signaling init process")
+               }
+               return nil
+       }
+       return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+}
+
+// createExecFifo creates the exec fifo in the container's state dir,
+// chowned to the container's host root uid/gid. The umask is cleared
+// around Mkfifo so the 0622 mode is applied exactly.
+func (c *linuxContainer) createExecFifo() error {
+       rootuid, err := c.Config().HostRootUID()
+       if err != nil {
+               return err
+       }
+       rootgid, err := c.Config().HostRootGID()
+       if err != nil {
+               return err
+       }
+
+       fifoName := filepath.Join(c.root, execFifoFilename)
+       if _, err := os.Stat(fifoName); err == nil {
+               return fmt.Errorf("exec fifo %s already exists", fifoName)
+       }
+       oldMask := unix.Umask(0000)
+       if err := unix.Mkfifo(fifoName, 0622); err != nil {
+               unix.Umask(oldMask)
+               return err
+       }
+       unix.Umask(oldMask)
+       return os.Chown(fifoName, rootuid, rootgid)
+}
+
+// deleteExecFifo removes the exec fifo from the state dir; removal
+// errors are deliberately ignored (best-effort cleanup).
+func (c *linuxContainer) deleteExecFifo() {
+       fifoName := filepath.Join(c.root, execFifoFilename)
+       os.Remove(fifoName)
+}
+
+// includeExecFifo opens the container's execfifo as a pathfd, so that the
+// container cannot access the statedir (and the FIFO itself remains
+// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
+// fd, with _LIBCONTAINER_FIFOFD set to its fd number. The fd number
+// accounts for the three stdio fds that precede ExtraFiles.
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
+       fifoName := filepath.Join(c.root, execFifoFilename)
+       fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
+       if err != nil {
+               return err
+       }
+
+       cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
+       cmd.Env = append(cmd.Env,
+               fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+       return nil
+}
+
+// newParentProcess builds the parent-side process object for p: it
+// creates the init socket pair and log pipe, builds the exec.Cmd, and
+// returns either an initProcess (p.Init) or a setnsProcess
+// (`runc exec` into an existing container).
+func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
+       parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
+       if err != nil {
+               return nil, newSystemErrorWithCause(err, "creating new init pipe")
+       }
+       messageSockPair := filePair{parentInitPipe, childInitPipe}
+
+       parentLogPipe, childLogPipe, err := os.Pipe()
+       if err != nil {
+               return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
+       }
+       logFilePair := filePair{parentLogPipe, childLogPipe}
+
+       cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe)
+       if err != nil {
+               return nil, newSystemErrorWithCause(err, "creating new command template")
+       }
+       if !p.Init {
+               return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
+       }
+
+       // We only set up fifoFd if we're not doing a `runc exec`. The historic
+       // reason for this is that previously we would pass a dirfd that allowed
+       // for container rootfs escape (and not doing it in `runc exec` avoided
+       // that problem), but we no longer do that. However, there's no need to do
+       // this for `runc exec` so we just keep it this way to be safe.
+       if err := c.includeExecFifo(cmd); err != nil {
+               return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
+       }
+       return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
+}
+
+// commandTemplate builds the exec.Cmd used to launch the container
+// init/setns helper: it wires up stdio, the console socket, the init
+// and log pipes as inherited fds, and the _LIBCONTAINER_* environment
+// variables that tell the child which fd numbers to use.
+func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) (*exec.Cmd, error) {
+       cmd := exec.Command(c.initPath, c.initArgs[1:]...)
+       cmd.Args[0] = c.initArgs[0]
+       cmd.Stdin = p.Stdin
+       cmd.Stdout = p.Stdout
+       cmd.Stderr = p.Stderr
+       cmd.Dir = c.config.Rootfs
+       if cmd.SysProcAttr == nil {
+               cmd.SysProcAttr = &syscall.SysProcAttr{}
+       }
+       cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
+       cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
+       if p.ConsoleSocket != nil {
+               cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
+               cmd.Env = append(cmd.Env,
+                       fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+               )
+       }
+       cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
+       cmd.Env = append(cmd.Env,
+               fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+               fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root),
+       )
+
+       cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
+       cmd.Env = append(cmd.Env,
+               fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+               fmt.Sprintf("_LIBCONTAINER_LOGLEVEL=%s", p.LogLevel),
+       )
+
+       // NOTE: when running a container with no PID namespace and the parent process spawning the container is
+       // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
+       // even with the parent still running.
+       if c.config.ParentDeathSignal > 0 {
+               cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
+       }
+       return cmd, nil
+}
+
+// newInitProcess builds the initProcess for a container's first
+// process, including the netlink bootstrap data derived from the
+// configured namespaces, and records it as c.initProcess.
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
+       cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+       nsMaps := make(map[configs.NamespaceType]string)
+       for _, ns := range c.config.Namespaces {
+               if ns.Path != "" {
+                       nsMaps[ns.Type] = ns.Path
+               }
+       }
+       // a preset PID namespace path means init shares a PID ns
+       _, sharePidns := nsMaps[configs.NEWPID]
+       data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
+       if err != nil {
+               return nil, err
+       }
+       init := &initProcess{
+               cmd:             cmd,
+               messageSockPair: messageSockPair,
+               logFilePair:     logFilePair,
+               manager:         c.cgroupManager,
+               intelRdtManager: c.intelRdtManager,
+               config:          c.newInitConfig(p),
+               container:       c,
+               process:         p,
+               bootstrapData:   data,
+               sharePidns:      sharePidns,
+       }
+       c.initProcess = init
+       return init, nil
+}
+
+// newSetnsProcess builds a setnsProcess for `runc exec`, joining the
+// namespaces recorded in the container's saved state.
+func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
+       cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
+       state, err := c.currentState()
+       if err != nil {
+               return nil, newSystemErrorWithCause(err, "getting container's current state")
+       }
+       // for setns process, we don't have to set cloneflags as the process namespaces
+       // will only be set via setns syscall
+       data, err := c.bootstrapData(0, state.NamespacePaths)
+       if err != nil {
+               return nil, err
+       }
+       return &setnsProcess{
+               cmd:             cmd,
+               cgroupPaths:     c.cgroupManager.GetPaths(),
+               rootlessCgroups: c.config.RootlessCgroups,
+               intelRdtPath:    state.IntelRdtPath,
+               messageSockPair: messageSockPair,
+               logFilePair:     logFilePair,
+               config:          c.newInitConfig(p),
+               process:         p,
+               bootstrapData:   data,
+       }, nil
+}
+
+// newInitConfig merges container-level configuration with per-process
+// overrides (no-new-privileges, AppArmor profile, process label,
+// rlimits, console settings) into the initConfig sent to the child.
+func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
+       cfg := &initConfig{
+               Config:           c.config,
+               Args:             process.Args,
+               Env:              process.Env,
+               User:             process.User,
+               AdditionalGroups: process.AdditionalGroups,
+               Cwd:              process.Cwd,
+               Capabilities:     process.Capabilities,
+               PassedFilesCount: len(process.ExtraFiles),
+               ContainerId:      c.ID(),
+               NoNewPrivileges:  c.config.NoNewPrivileges,
+               RootlessEUID:     c.config.RootlessEUID,
+               RootlessCgroups:  c.config.RootlessCgroups,
+               AppArmorProfile:  c.config.AppArmorProfile,
+               ProcessLabel:     c.config.ProcessLabel,
+               Rlimits:          c.config.Rlimits,
+       }
+       // per-process settings override the container defaults
+       if process.NoNewPrivileges != nil {
+               cfg.NoNewPrivileges = *process.NoNewPrivileges
+       }
+       if process.AppArmorProfile != "" {
+               cfg.AppArmorProfile = process.AppArmorProfile
+       }
+       if process.Label != "" {
+               cfg.ProcessLabel = process.Label
+       }
+       if len(process.Rlimits) > 0 {
+               cfg.Rlimits = process.Rlimits
+       }
+       cfg.CreateConsole = process.ConsoleSocket != nil
+       cfg.ConsoleWidth = process.ConsoleWidth
+       cfg.ConsoleHeight = process.ConsoleHeight
+       return cfg
+}
+
+// Destroy delegates to the current state's destroy transition while
+// holding the state lock.
+func (c *linuxContainer) Destroy() error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       return c.state.destroy()
+}
+
+// Pause freezes all container processes via the cgroup freezer and
+// transitions to the paused state. Only running or created containers
+// may be paused.
+func (c *linuxContainer) Pause() error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       status, err := c.currentStatus()
+       if err != nil {
+               return err
+       }
+       switch status {
+       case Running, Created:
+               if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
+                       return err
+               }
+               return c.state.transition(&pausedState{
+                       c: c,
+               })
+       }
+       return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
+}
+
+// Resume thaws the cgroup freezer and transitions the container back
+// to the running state; only paused containers may be resumed.
+func (c *linuxContainer) Resume() error {
+       c.m.Lock()
+       defer c.m.Unlock()
+       status, err := c.currentStatus()
+       if err != nil {
+               return err
+       }
+       if status != Paused {
+               return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
+       }
+       if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
+               return err
+       }
+       return c.state.transition(&runningState{
+               c: c,
+       })
+}
+
+// NotifyOOM returns a channel that is signaled when the container's
+// cgroup reports an OOM event.
+func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
+       // XXX(cyphar): This requires cgroups.
+       if c.config.RootlessCgroups {
+               logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
+       }
+       return notifyOnOOM(c.cgroupManager.GetPaths())
+}
+
+// NotifyMemoryPressure returns a channel that is signaled when the
+// container's cgroup reaches the given memory pressure level.
+func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
+       // XXX(cyphar): This requires cgroups.
+       if c.config.RootlessCgroups {
+               logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
+       }
+       return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
+}
+
+// criuFeatures holds the reply to the most recent CRIU FEATURE_CHECK
+// request; it is reset to nil before each check (see checkCriuFeatures).
+var criuFeatures *criurpc.CriuFeatures
+
+// checkCriuFeatures asks CRIU (via a FEATURE_CHECK RPC) whether the
+// features requested in criuFeat are supported, and returns an error
+// if any requested feature is missing. On CRIU older than 1.8 the
+// check is skipped entirely.
+func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
+
+       var t criurpc.CriuReqType
+       t = criurpc.CriuReqType_FEATURE_CHECK
+
+       // criu 1.8 => 10800
+       if err := c.checkCriuVersion(10800); err != nil {
+               // Feature checking was introduced with CRIU 1.8.
+               // Ignore the feature check if an older CRIU version is used
+               // and just act as before.
+               // As all automated PR testing is done using CRIU 1.7 this
+               // code will not be tested by automated PR testing.
+               return nil
+       }
+
+       // make sure the features we are looking for are really not from
+       // some previous check
+       criuFeatures = nil
+
+       req := &criurpc.CriuReq{
+               Type: &t,
+               // Theoretically this should not be necessary but CRIU
+               // segfaults if Opts is empty.
+               // Fixed in CRIU 2.12
+               Opts:     rpcOpts,
+               Features: criuFeat,
+       }
+
+       err := c.criuSwrk(nil, req, criuOpts, false, nil)
+       if err != nil {
+               logrus.Debugf("%s", err)
+               return fmt.Errorf("CRIU feature check failed")
+       }
+
+       logrus.Debugf("Feature check says: %s", criuFeatures)
+       missingFeatures := false
+
+       // The outer if checks if the fields actually exist
+       if (criuFeat.MemTrack != nil) &&
+               (criuFeatures.MemTrack != nil) {
+               // The inner if checks if they are set to true
+               if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
+                       missingFeatures = true
+                       logrus.Debugf("CRIU does not support MemTrack")
+               }
+       }
+
+       // This needs to be repeated for every new feature check.
+       // Is there a way to put this in a function. Reflection?
+       if (criuFeat.LazyPages != nil) &&
+               (criuFeatures.LazyPages != nil) {
+               if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
+                       missingFeatures = true
+                       logrus.Debugf("CRIU does not support LazyPages")
+               }
+       }
+
+       if missingFeatures {
+               return fmt.Errorf("CRIU is missing features")
+       }
+
+       return nil
+}
+
+// parseCriuVersion runs `criu -V` and parses the output into a single
+// integer x*10000 + y*100 + z, handling both release output
+// ("Version: x.y[.z]") and git-build output ("GitID: vx.y[.z]-...");
+// a git build is counted as the next minor/sublevel.
+func parseCriuVersion(path string) (int, error) {
+       var x, y, z int
+
+       out, err := exec.Command(path, "-V").Output()
+       if err != nil {
+               return 0, fmt.Errorf("Unable to execute CRIU command: %s", path)
+       }
+
+       x = 0
+       y = 0
+       z = 0
+       if ep := strings.Index(string(out), "-"); ep >= 0 {
+               // criu Git version format
+               var version string
+               if sp := strings.Index(string(out), "GitID"); sp > 0 {
+                       version = string(out)[sp:ep]
+               } else {
+                       return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path)
+               }
+
+               n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
+               if err != nil {
+                       n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6
+                       y++
+               } else {
+                       z++
+               }
+               if n < 2 || err != nil {
+                       return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
+               }
+       } else {
+               // criu release version format
+               n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
+               if err != nil {
+                       n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
+               }
+               if n < 2 || err != nil {
+                       return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
+               }
+       }
+
+       return x*10000 + y*100 + z, nil
+}
+
+// compareCriuVersion returns an error when criuVersion (encoded as
+// x*10000 + y*100 + z) is below minVersion.
+func compareCriuVersion(criuVersion int, minVersion int) error {
+       // simple function to perform the actual version compare
+       if criuVersion < minVersion {
+               return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
+       }
+
+       return nil
+}
+
+// criuVersionRPC stores the result of the CRIU version RPC; it is
+// reset to nil before each query (see checkCriuVersion).
+var criuVersionRPC *criurpc.CriuVersion
+
+// checkCriuVersion checks that the CRIU version is greater than or
+// equal to minVersion. The version is determined once (preferring the
+// version RPC introduced in CRIU 3.0, falling back to parsing
+// `criu -V` output) and cached in c.criuVersion.
+func (c *linuxContainer) checkCriuVersion(minVersion int) error {
+
+       // If the version of criu has already been determined there is no need
+       // to ask criu for the version again. Use the value from c.criuVersion.
+       if c.criuVersion != 0 {
+               return compareCriuVersion(c.criuVersion, minVersion)
+       }
+
+       // First try if this version of CRIU support the version RPC.
+       // The CRIU version RPC was introduced with CRIU 3.0.
+
+       // First, reset the variable for the RPC answer to nil
+       criuVersionRPC = nil
+
+       var t criurpc.CriuReqType
+       t = criurpc.CriuReqType_VERSION
+       req := &criurpc.CriuReq{
+               Type: &t,
+       }
+
+       err := c.criuSwrk(nil, req, nil, false, nil)
+       if err != nil {
+               return fmt.Errorf("CRIU version check failed: %s", err)
+       }
+
+       if criuVersionRPC != nil {
+               logrus.Debugf("CRIU version: %s", criuVersionRPC)
+               // major and minor are always set
+               c.criuVersion = int(*criuVersionRPC.Major) * 10000
+               c.criuVersion += int(*criuVersionRPC.Minor) * 100
+               if criuVersionRPC.Sublevel != nil {
+                       c.criuVersion += int(*criuVersionRPC.Sublevel)
+               }
+               if criuVersionRPC.Gitid != nil {
+                       // runc's convention is that a CRIU git release is
+                       // always the same as increasing the minor by 1
+                       c.criuVersion -= (c.criuVersion % 100)
+                       c.criuVersion += 100
+               }
+               return compareCriuVersion(c.criuVersion, minVersion)
+       }
+
+       // This is CRIU without the version RPC and therefore
+       // older than 3.0. Parsing the output is required.
+
+       // This can be removed once runc no longer works with criu older than 3.0
+
+       c.criuVersion, err = parseCriuVersion(c.criuPath)
+       if err != nil {
+               return err
+       }
+
+       return compareCriuVersion(c.criuVersion, minVersion)
+}
+
+// descriptorsFilename names the state-dir file recording the
+// container's external stdio descriptors (see State.ExternalDescriptors),
+// used by checkpoint/restore.
+const descriptorsFilename = "descriptors.json"
+
+// addCriuDumpMount records m's destination (made relative to the
+// rootfs if needed) as a CRIU external mount mapping, with the same
+// path used as both key and value.
+func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
+       mountDest := m.Destination
+       if strings.HasPrefix(mountDest, c.config.Rootfs) {
+               mountDest = mountDest[len(c.config.Rootfs):]
+       }
+
+       extMnt := &criurpc.ExtMountMap{
+               Key: proto.String(mountDest),
+               Val: proto.String(mountDest),
+       }
+       req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+}
+
+func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
+       for _, path := range c.config.MaskPaths {
+               fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
+               if err != nil {
+                       if os.IsNotExist(err) {
+                               continue
+                       }
+                       return err
+               }
+               if fi.IsDir() {
+                       continue
+               }
+
+               extMnt := &criurpc.ExtMountMap{
+                       Key: proto.String(path),
+                       Val: proto.String("/dev/null"),
+               }
+               req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+       }
+       return nil
+}
+
+func waitForCriuLazyServer(r *os.File, status string) error {
+
+       data := make([]byte, 1)
+       _, err := r.Read(data)
+       if err != nil {
+               return err
+       }
+       fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
+       if err != nil {
+               return err
+       }
+       _, err = fd.Write(data)
+       if err != nil {
+               return err
+       }
+       fd.Close()
+
+       return nil
+}
+
+func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
+       // CRIU will evaluate a configuration starting with release 3.11.
+       // Settings in the configuration file will overwrite RPC settings.
+       // Look for annotations. The annotation 'org.criu.config'
+       // specifies if CRIU should use a different, container specific
+       // configuration file.
+       _, annotations := utils.Annotations(c.config.Labels)
+       configFile, exists := annotations["org.criu.config"]
+       if exists {
+               // If the annotation 'org.criu.config' exists and is set
+               // to a non-empty string, tell CRIU to use that as a
+               // configuration file. If the file does not exist, CRIU
+               // will just ignore it.
+               if configFile != "" {
+                       rpcOpts.ConfigFile = proto.String(configFile)
+               }
+               // If 'org.criu.config' exists and is set to an empty
+               // string, a runc specific CRIU configuration file will
+               // be not set at all.
+       } else {
+               // If the mentioned annotation has not been found, specify
+               // a default CRIU configuration file.
+               rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
+       }
+}
+
// Checkpoint dumps the state of the running container into
// criuOpts.ImagesDirectory using CRIU in swrk (RPC) mode. Depending on
// criuOpts it performs a full dump, a pre-dump for iterative migration,
// or arms lazy-pages (post-copy) migration. The container lock is held
// for the whole operation. Returns an error if directories cannot be
// prepared, a required CRIU version/feature is missing, or the dump
// RPC fails.
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
	c.m.Lock()
	defer c.m.Unlock()

	// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
	// (CLI prints a warning)
	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
	//               support for doing unprivileged dumps, but the setup of
	//               rootless containers might make this complicated.

	// criu 1.5.2 => 10502
	if err := c.checkCriuVersion(10502); err != nil {
		return err
	}

	if criuOpts.ImagesDirectory == "" {
		return fmt.Errorf("invalid directory to save checkpoint")
	}

	// Since a container can be C/R'ed multiple times,
	// the checkpoint directory may already exist.
	if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) {
		return err
	}

	if criuOpts.WorkDirectory == "" {
		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
	}

	if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
		return err
	}

	// Both directories are handed to CRIU as open file descriptors
	// (ImagesDirFd / WorkDirFd below), not as path strings.
	workDir, err := os.Open(criuOpts.WorkDirectory)
	if err != nil {
		return err
	}
	defer workDir.Close()

	imageDir, err := os.Open(criuOpts.ImagesDirectory)
	if err != nil {
		return err
	}
	defer imageDir.Close()

	rpcOpts := criurpc.CriuOpts{
		ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
		WorkDirFd:       proto.Int32(int32(workDir.Fd())),
		LogLevel:        proto.Int32(4),
		LogFile:         proto.String("dump.log"),
		Root:            proto.String(c.config.Rootfs),
		ManageCgroups:   proto.Bool(true),
		NotifyScripts:   proto.Bool(true),
		Pid:             proto.Int32(int32(c.initProcess.pid())),
		ShellJob:        proto.Bool(criuOpts.ShellJob),
		LeaveRunning:    proto.Bool(criuOpts.LeaveRunning),
		TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
		ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
		FileLocks:       proto.Bool(criuOpts.FileLocks),
		EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
		OrphanPtsMaster: proto.Bool(true),
		AutoDedup:       proto.Bool(criuOpts.AutoDedup),
		LazyPages:       proto.Bool(criuOpts.LazyPages),
	}

	c.handleCriuConfigurationFile(&rpcOpts)

	// If the container is running in a network namespace and has
	// a path to the network namespace configured, we will dump
	// that network namespace as an external namespace and we
	// will expect that the namespace exists during restore.
	// This basically means that CRIU will ignore the namespace
	// and expect to be setup correctly.
	nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
	if nsPath != "" {
		// For this to work we need at least criu 3.11.0 => 31100.
		// As there was already a successful version check we will
		// not error out if it fails. runc will just behave as it used
		// to do and ignore external network namespaces.
		err := c.checkCriuVersion(31100)
		if err == nil {
			// CRIU expects the information about an external namespace
			// like this: --external net[<inode>]:<key>
			// This <key> is always 'extRootNetNS'.
			var netns syscall.Stat_t
			err = syscall.Stat(nsPath, &netns)
			if err != nil {
				return err
			}
			criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
			rpcOpts.External = append(rpcOpts.External, criuExternal)
		}
	}

	// Freezing via the freezer cgroup lets CRIU stop all tasks
	// atomically before dumping, if the cgroup is available.
	fcg := c.cgroupManager.GetPaths()["freezer"]
	if fcg != "" {
		rpcOpts.FreezeCgroup = proto.String(fcg)
	}

	// append optional criu opts, e.g., page-server and port
	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
			Address: proto.String(criuOpts.PageServer.Address),
			Port:    proto.Int32(criuOpts.PageServer.Port),
		}
	}

	//pre-dump may need parentImage param to complete iterative migration
	if criuOpts.ParentImage != "" {
		rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
		rpcOpts.TrackMem = proto.Bool(true)
	}

	// append optional manage cgroups mode
	if criuOpts.ManageCgroupsMode != 0 {
		// criu 1.7 => 10700
		if err := c.checkCriuVersion(10700); err != nil {
			return err
		}
		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
		rpcOpts.ManageCgroupsMode = &mode
	}

	// Pre-dump requires memory tracking support in CRIU; verify it
	// via a FEATURE_CHECK RPC before committing to PRE_DUMP.
	var t criurpc.CriuReqType
	if criuOpts.PreDump {
		feat := criurpc.CriuFeatures{
			MemTrack: proto.Bool(true),
		}

		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
			return err
		}

		t = criurpc.CriuReqType_PRE_DUMP
	} else {
		t = criurpc.CriuReqType_DUMP
	}
	req := &criurpc.CriuReq{
		Type: &t,
		Opts: &rpcOpts,
	}

	if criuOpts.LazyPages {
		// lazy migration requested; check if criu supports it
		feat := criurpc.CriuFeatures{
			LazyPages: proto.Bool(true),
		}

		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
			return err
		}

		// CRIU writes a readiness byte to the write end of this pipe
		// once the lazy-pages server is up; the goroutine relays it
		// to the user-supplied status file.
		statusRead, statusWrite, err := os.Pipe()
		if err != nil {
			return err
		}
		rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
		// NOTE(review): the goroutine's error is discarded and
		// statusWrite is never closed by this function — presumably
		// relying on process exit for cleanup; TODO confirm.
		go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
	}

	//no need to dump these information in pre-dump
	if !criuOpts.PreDump {
		// Bind and cgroup mounts must be declared external so CRIU
		// does not try to dump their contents.
		for _, m := range c.config.Mounts {
			switch m.Device {
			case "bind":
				c.addCriuDumpMount(req, m)
			case "cgroup":
				binds, err := getCgroupMounts(m)
				if err != nil {
					return err
				}
				for _, b := range binds {
					c.addCriuDumpMount(req, b)
				}
			}
		}

		if err := c.addMaskPaths(req); err != nil {
			return err
		}

		for _, node := range c.config.Devices {
			m := &configs.Mount{Destination: node.Path, Source: node.Path}
			c.addCriuDumpMount(req, m)
		}

		// Write the FD info to a file in the image directory
		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
		if err != nil {
			return err
		}

		err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600)
		if err != nil {
			return err
		}
	}

	// Run the actual dump RPC conversation with the CRIU child.
	err = c.criuSwrk(nil, req, criuOpts, false, nil)
	if err != nil {
		return err
	}
	return nil
}
+
+func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
+       mountDest := m.Destination
+       if strings.HasPrefix(mountDest, c.config.Rootfs) {
+               mountDest = mountDest[len(c.config.Rootfs):]
+       }
+
+       extMnt := &criurpc.ExtMountMap{
+               Key: proto.String(mountDest),
+               Val: proto.String(m.Source),
+       }
+       req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+}
+
+func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
+       for _, iface := range c.config.Networks {
+               switch iface.Type {
+               case "veth":
+                       veth := new(criurpc.CriuVethPair)
+                       veth.IfOut = proto.String(iface.HostInterfaceName)
+                       veth.IfIn = proto.String(iface.Name)
+                       req.Opts.Veths = append(req.Opts.Veths, veth)
+               case "loopback":
+                       // Do nothing
+               }
+       }
+       for _, i := range criuOpts.VethPairs {
+               veth := new(criurpc.CriuVethPair)
+               veth.IfOut = proto.String(i.HostInterfaceName)
+               veth.IfIn = proto.String(i.ContainerInterfaceName)
+               req.Opts.Veths = append(req.Opts.Veths, veth)
+       }
+}
+
+// makeCriuRestoreMountpoints makes the actual mountpoints for the
+// restore using CRIU. This function is inspired from the code in
+// rootfs_linux.go
+func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
+       switch m.Device {
+       case "cgroup":
+               // Do nothing for cgroup, CRIU should handle it
+       case "bind":
+               // The prepareBindMount() function checks if source
+               // exists. So it cannot be used for other filesystem types.
+               if err := prepareBindMount(m, c.config.Rootfs); err != nil {
+                       return err
+               }
+       default:
+               // for all other file-systems just create the mountpoints
+               dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
+               if err != nil {
+                       return err
+               }
+               if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
+                       return err
+               }
+               m.Destination = dest
+               if err := os.MkdirAll(dest, 0755); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
// isPathInPrefixList reports whether path may be created during a CRIU
// restore because it is NOT located below any of the given prefixes
// (typically tmpfs mountpoints whose content CRIU restores itself).
// Note the inverted sense relative to the name: it returns false when
// the path lies strictly under one of the prefixes, true otherwise
// (including when path equals a prefix exactly).
func isPathInPrefixList(path string, prefix []string) bool {
	for _, p := range prefix {
		if strings.HasPrefix(path, p+"/") {
			return false
		}
	}
	return true
}
+
+// prepareCriuRestoreMounts tries to set up the rootfs of the
+// container to be restored in the same way runc does it for
+// initial container creation. Even for a read-only rootfs container
+// runc modifies the rootfs to add mountpoints which do not exist.
+// This function also creates missing mountpoints as long as they
+// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
+func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
+       // First get a list of a all tmpfs mounts
+       tmpfs := []string{}
+       for _, m := range mounts {
+               switch m.Device {
+               case "tmpfs":
+                       tmpfs = append(tmpfs, m.Destination)
+               }
+       }
+       // Now go through all mounts and create the mountpoints
+       // if the mountpoints are not on a tmpfs, as CRIU will
+       // restore the complete tmpfs content from its checkpoint.
+       for _, m := range mounts {
+               if isPathInPrefixList(m.Destination, tmpfs) {
+                       if err := c.makeCriuRestoreMountpoints(m); err != nil {
+                               return err
+                       }
+               }
+       }
+       return nil
+}
+
+func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
+       c.m.Lock()
+       defer c.m.Unlock()
+
+       var extraFiles []*os.File
+
+       // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
+       // (CLI prints a warning)
+       // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
+       //               support for unprivileged restore at the moment.
+
+       // criu 1.5.2 => 10502
+       if err := c.checkCriuVersion(10502); err != nil {
+               return err
+       }
+       if criuOpts.WorkDirectory == "" {
+               criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
+       }
+       // Since a container can be C/R'ed multiple times,
+       // the work directory may already exist.
+       if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
+               return err
+       }
+       workDir, err := os.Open(criuOpts.WorkDirectory)
+       if err != nil {
+               return err
+       }
+       defer workDir.Close()
+       if criuOpts.ImagesDirectory == "" {
+               return fmt.Errorf("invalid directory to restore checkpoint")
+       }
+       imageDir, err := os.Open(criuOpts.ImagesDirectory)
+       if err != nil {
+               return err
+       }
+       defer imageDir.Close()
+       // CRIU has a few requirements for a root directory:
+       // * it must be a mount point
+       // * its parent must not be overmounted
+       // c.config.Rootfs is bind-mounted to a temporary directory
+       // to satisfy these requirements.
+       root := filepath.Join(c.root, "criu-root")
+       if err := os.Mkdir(root, 0755); err != nil {
+               return err
+       }
+       defer os.Remove(root)
+       root, err = filepath.EvalSymlinks(root)
+       if err != nil {
+               return err
+       }
+       err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
+       if err != nil {
+               return err
+       }
+       defer unix.Unmount(root, unix.MNT_DETACH)
+       t := criurpc.CriuReqType_RESTORE
+       req := &criurpc.CriuReq{
+               Type: &t,
+               Opts: &criurpc.CriuOpts{
+                       ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
+                       WorkDirFd:       proto.Int32(int32(workDir.Fd())),
+                       EvasiveDevices:  proto.Bool(true),
+                       LogLevel:        proto.Int32(4),
+                       LogFile:         proto.String("restore.log"),
+                       RstSibling:      proto.Bool(true),
+                       Root:            proto.String(root),
+                       ManageCgroups:   proto.Bool(true),
+                       NotifyScripts:   proto.Bool(true),
+                       ShellJob:        proto.Bool(criuOpts.ShellJob),
+                       ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
+                       TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
+                       FileLocks:       proto.Bool(criuOpts.FileLocks),
+                       EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
+                       OrphanPtsMaster: proto.Bool(true),
+                       AutoDedup:       proto.Bool(criuOpts.AutoDedup),
+                       LazyPages:       proto.Bool(criuOpts.LazyPages),
+               },
+       }
+
+       c.handleCriuConfigurationFile(req.Opts)
+
+       // Same as during checkpointing. If the container has a specific network namespace
+       // assigned to it, this now expects that the checkpoint will be restored in a
+       // already created network namespace.
+       nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
+       if nsPath != "" {
+               // For this to work we need at least criu 3.11.0 => 31100.
+               // As there was already a successful version check we will
+               // not error out if it fails. runc will just behave as it used
+               // to do and ignore external network namespaces.
+               err := c.checkCriuVersion(31100)
+               if err == nil {
+                       // CRIU wants the information about an existing network namespace
+                       // like this: --inherit-fd fd[<fd>]:<key>
+                       // The <key> needs to be the same as during checkpointing.
+                       // We are always using 'extRootNetNS' as the key in this.
+                       netns, err := os.Open(nsPath)
+                       defer netns.Close()
+                       if err != nil {
+                               logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
+                               return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
+                       }
+                       inheritFd := new(criurpc.InheritFd)
+                       inheritFd.Key = proto.String("extRootNetNS")
+                       // The offset of four is necessary because 0, 1, 2 and 3 is already
+                       // used by stdin, stdout, stderr, 'criu swrk' socket.
+                       inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
+                       req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
+                       // All open FDs need to be transferred to CRIU via extraFiles
+                       extraFiles = append(extraFiles, netns)
+               }
+       }
+
+       // This will modify the rootfs of the container in the same way runc
+       // modifies the container during initial creation.
+       if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
+               return err
+       }
+
+       for _, m := range c.config.Mounts {
+               switch m.Device {
+               case "bind":
+                       c.addCriuRestoreMount(req, m)
+               case "cgroup":
+                       binds, err := getCgroupMounts(m)
+                       if err != nil {
+                               return err
+                       }
+                       for _, b := range binds {
+                               c.addCriuRestoreMount(req, b)
+                       }
+               }
+       }
+
+       if len(c.config.MaskPaths) > 0 {
+               m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
+               c.addCriuRestoreMount(req, m)
+       }
+
+       for _, node := range c.config.Devices {
+               m := &configs.Mount{Destination: node.Path, Source: node.Path}
+               c.addCriuRestoreMount(req, m)
+       }
+
+       if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
+               c.restoreNetwork(req, criuOpts)
+       }
+
+       // append optional manage cgroups mode
+       if criuOpts.ManageCgroupsMode != 0 {
+               // criu 1.7 => 10700
+               if err := c.checkCriuVersion(10700); err != nil {
+                       return err
+               }
+               mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+               req.Opts.ManageCgroupsMode = &mode
+       }
+
+       var (
+               fds    []string
+               fdJSON []byte
+       )
+       if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
+               return err
+       }
+
+       if err := json.Unmarshal(fdJSON, &fds); err != nil {
+               return err
+       }
+       for i := range fds {
+               if s := fds[i]; strings.Contains(s, "pipe:") {
+                       inheritFd := new(criurpc.InheritFd)
+                       inheritFd.Key = proto.String(s)
+                       inheritFd.Fd = proto.Int32(int32(i))
+                       req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
+               }
+       }
+       return c.criuSwrk(process, req, criuOpts, true, extraFiles)
+}
+
+func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
+       // XXX: Do we need to deal with this case? AFAIK criu still requires root.
+       if err := c.cgroupManager.Apply(pid); err != nil {
+               return err
+       }
+
+       if err := c.cgroupManager.Set(c.config); err != nil {
+               return newSystemError(err)
+       }
+
+       path := fmt.Sprintf("/proc/%d/cgroup", pid)
+       cgroupsPaths, err := cgroups.ParseCgroupFile(path)
+       if err != nil {
+               return err
+       }
+
+       for c, p := range cgroupsPaths {
+               cgroupRoot := &criurpc.CgroupRoot{
+                       Ctrl: proto.String(c),
+                       Path: proto.String(p),
+               }
+               req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
+       }
+
+       return nil
+}
+
+func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
+       fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
+       if err != nil {
+               return err
+       }
+
+       var logPath string
+       if opts != nil {
+               logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
+       } else {
+               // For the VERSION RPC 'opts' is set to 'nil' and therefore
+               // opts.WorkDirectory does not exist. Set logPath to "".
+               logPath = ""
+       }
+       criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
+       criuClientFileCon, err := net.FileConn(criuClient)
+       criuClient.Close()
+       if err != nil {
+               return err
+       }
+
+       criuClientCon := criuClientFileCon.(*net.UnixConn)
+       defer criuClientCon.Close()
+
+       criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
+       defer criuServer.Close()
+
+       args := []string{"swrk", "3"}
+       if c.criuVersion != 0 {
+               // If the CRIU Version is still '0' then this is probably
+               // the initial CRIU run to detect the version. Skip it.
+               logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
+       }
+       logrus.Debugf("Using CRIU with following args: %s", args)
+       cmd := exec.Command(c.criuPath, args...)
+       if process != nil {
+               cmd.Stdin = process.Stdin
+               cmd.Stdout = process.Stdout
+               cmd.Stderr = process.Stderr
+       }
+       cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
+       if extraFiles != nil {
+               cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
+       }
+
+       if err := cmd.Start(); err != nil {
+               return err
+       }
+       criuServer.Close()
+
+       defer func() {
+               criuClientCon.Close()
+               _, err := cmd.Process.Wait()
+               if err != nil {
+                       return
+               }
+       }()
+
+       if applyCgroups {
+               err := c.criuApplyCgroups(cmd.Process.Pid, req)
+               if err != nil {
+                       return err
+               }
+       }
+
+       var extFds []string
+       if process != nil {
+               extFds, err = getPipeFds(cmd.Process.Pid)
+               if err != nil {
+                       return err
+               }
+       }
+
+       logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
+       // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
+       // should be empty. For older CRIU versions it still will be
+       // available but empty. criurpc.CriuReqType_VERSION actually
+       // has no req.GetOpts().
+       if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
+               req.GetType() == criurpc.CriuReqType_VERSION) {
+
+               val := reflect.ValueOf(req.GetOpts())
+               v := reflect.Indirect(val)
+               for i := 0; i < v.NumField(); i++ {
+                       st := v.Type()
+                       name := st.Field(i).Name
+                       if strings.HasPrefix(name, "XXX_") {
+                               continue
+                       }
+                       value := val.MethodByName("Get" + name).Call([]reflect.Value{})
+                       logrus.Debugf("CRIU option %s with value %v", name, value[0])
+               }
+       }
+       data, err := proto.Marshal(req)
+       if err != nil {
+               return err
+       }
+       _, err = criuClientCon.Write(data)
+       if err != nil {
+               return err
+       }
+
+       buf := make([]byte, 10*4096)
+       oob := make([]byte, 4096)
+       for true {
+               n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
+               if err != nil {
+                       return err
+               }
+               if n == 0 {
+                       return fmt.Errorf("unexpected EOF")
+               }
+               if n == len(buf) {
+                       return fmt.Errorf("buffer is too small")
+               }
+
+               resp := new(criurpc.CriuResp)
+               err = proto.Unmarshal(buf[:n], resp)
+               if err != nil {
+                       return err
+               }
+               if !resp.GetSuccess() {
+                       typeString := req.GetType().String()
+                       if typeString == "VERSION" {
+                               // If the VERSION RPC fails this probably means that the CRIU
+                               // version is too old for this RPC. Just return 'nil'.
+                               return nil
+                       }
+                       return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
+               }
+
+               t := resp.GetType()
+               switch {
+               case t == criurpc.CriuReqType_VERSION:
+                       logrus.Debugf("CRIU version: %s", resp)
+                       criuVersionRPC = resp.GetVersion()
+                       break
+               case t == criurpc.CriuReqType_FEATURE_CHECK:
+                       logrus.Debugf("Feature check says: %s", resp)
+                       criuFeatures = resp.GetFeatures()
+               case t == criurpc.CriuReqType_NOTIFY:
+                       if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil {
+                               return err
+                       }
+                       t = criurpc.CriuReqType_NOTIFY
+                       req = &criurpc.CriuReq{
+                               Type:          &t,
+                               NotifySuccess: proto.Bool(true),
+                       }
+                       data, err = proto.Marshal(req)
+                       if err != nil {
+                               return err
+                       }
+                       _, err = criuClientCon.Write(data)
+                       if err != nil {
+                               return err
+                       }
+                       continue
+               case t == criurpc.CriuReqType_RESTORE:
+               case t == criurpc.CriuReqType_DUMP:
+               case t == criurpc.CriuReqType_PRE_DUMP:
+               default:
+                       return fmt.Errorf("unable to parse the response %s", resp.String())
+               }
+
+               break
+       }
+
+       criuClientCon.CloseWrite()
+       // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
+       // Here we want to wait only the CRIU process.
+       st, err := cmd.Process.Wait()
+       if err != nil {
+               return err
+       }
+
+       // In pre-dump mode CRIU is in a loop and waits for
+       // the final DUMP command.
+       // The current runc pre-dump approach, however, is
+       // start criu in PRE_DUMP once for a single pre-dump
+       // and not the whole series of pre-dump, pre-dump, ...m, dump
+       // If we got the message CriuReqType_PRE_DUMP it means
+       // CRIU was successful and we need to forcefully stop CRIU
+       if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
+               return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
+       }
+       return nil
+}
+
+// block any external network activity
+func lockNetwork(config *configs.Config) error {
+       for _, config := range config.Networks {
+               strategy, err := getStrategy(config.Type)
+               if err != nil {
+                       return err
+               }
+
+               if err := strategy.detach(config); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func unlockNetwork(config *configs.Config) error {
+       for _, config := range config.Networks {
+               strategy, err := getStrategy(config.Type)
+               if err != nil {
+                       return err
+               }
+               if err = strategy.attach(config); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error {
+       notify := resp.GetNotify()
+       if notify == nil {
+               return fmt.Errorf("invalid response: %s", resp.String())
+       }
+       logrus.Debugf("notify: %s\n", notify.GetScript())
+       switch {
+       case notify.GetScript() == "post-dump":
+               f, err := os.Create(filepath.Join(c.root, "checkpoint"))
+               if err != nil {
+                       return err
+               }
+               f.Close()
+       case notify.GetScript() == "network-unlock":
+               if err := unlockNetwork(c.config); err != nil {
+                       return err
+               }
+       case notify.GetScript() == "network-lock":
+               if err := lockNetwork(c.config); err != nil {
+                       return err
+               }
+       case notify.GetScript() == "setup-namespaces":
+               if c.config.Hooks != nil {
+                       s, err := c.currentOCIState()
+                       if err != nil {
+                               return nil
+                       }
+                       s.Pid = int(notify.GetPid())
+                       for i, hook := range c.config.Hooks.Prestart {
+                               if err := hook.Run(s); err != nil {
+                                       return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+                               }
+                       }
+               }
+       case notify.GetScript() == "post-restore":
+               pid := notify.GetPid()
+               r, err := newRestoredProcess(int(pid), fds)
+               if err != nil {
+                       return err
+               }
+               process.ops = r
+               if err := c.state.transition(&restoredState{
+                       imageDir: opts.ImagesDirectory,
+                       c:        c,
+               }); err != nil {
+                       return err
+               }
+               // create a timestamp indicating when the restored checkpoint was started
+               c.created = time.Now().UTC()
+               if _, err := c.updateState(r); err != nil {
+                       return err
+               }
+               if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
+                       if !os.IsNotExist(err) {
+                               logrus.Error(err)
+                       }
+               }
+       case notify.GetScript() == "orphan-pts-master":
+               scm, err := unix.ParseSocketControlMessage(oob)
+               if err != nil {
+                       return err
+               }
+               fds, err := unix.ParseUnixRights(&scm[0])
+               if err != nil {
+                       return err
+               }
+
+               master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
+               defer master.Close()
+
+               // While we can access console.master, using the API is a good idea.
+               if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
+       if process != nil {
+               c.initProcess = process
+       }
+       state, err := c.currentState()
+       if err != nil {
+               return nil, err
+       }
+       err = c.saveState(state)
+       if err != nil {
+               return nil, err
+       }
+       return state, nil
+}
+
+func (c *linuxContainer) saveState(s *State) error {
+       f, err := os.Create(filepath.Join(c.root, stateFilename))
+       if err != nil {
+               return err
+       }
+       defer f.Close()
+       return utils.WriteJSON(f, s)
+}
+
+func (c *linuxContainer) deleteState() error {
+       return os.Remove(filepath.Join(c.root, stateFilename))
+}
+
+func (c *linuxContainer) currentStatus() (Status, error) {
+       if err := c.refreshState(); err != nil {
+               return -1, err
+       }
+       return c.state.status(), nil
+}
+
+// refreshState needs to be called to verify that the current state on the
+// container is what is true.  Because consumers of libcontainer can use it
+// out of process we need to verify the container's status based on runtime
+// information and not rely on our in process info.
+func (c *linuxContainer) refreshState() error {
+       paused, err := c.isPaused()
+       if err != nil {
+               return err
+       }
+       if paused {
+               return c.state.transition(&pausedState{c: c})
+       }
+       t, err := c.runType()
+       if err != nil {
+               return err
+       }
+       switch t {
+       case Created:
+               return c.state.transition(&createdState{c: c})
+       case Running:
+               return c.state.transition(&runningState{c: c})
+       }
+       return c.state.transition(&stoppedState{c: c})
+}
+
+func (c *linuxContainer) runType() (Status, error) {
+       if c.initProcess == nil {
+               return Stopped, nil
+       }
+       pid := c.initProcess.pid()
+       stat, err := system.Stat(pid)
+       if err != nil {
+               return Stopped, nil
+       }
+       if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
+               return Stopped, nil
+       }
+       // We'll create exec fifo and blocking on it after container is created,
+       // and delete it after start container.
+       if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
+               return Created, nil
+       }
+       return Running, nil
+}
+
+func (c *linuxContainer) isPaused() (bool, error) {
+       fcg := c.cgroupManager.GetPaths()["freezer"]
+       if fcg == "" {
+               // A container doesn't have a freezer cgroup
+               return false, nil
+       }
+       pausedState := "FROZEN"
+       filename := "freezer.state"
+       if cgroups.IsCgroup2UnifiedMode() {
+               filename = "cgroup.freeze"
+               pausedState = "1"
+       }
+
+       data, err := ioutil.ReadFile(filepath.Join(fcg, filename))
+       if err != nil {
+               // If freezer cgroup is not mounted, the container would just be not paused.
+               if os.IsNotExist(err) || err == syscall.ENODEV {
+                       return false, nil
+               }
+               return false, newSystemErrorWithCause(err, "checking if container is paused")
+       }
+       return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil
+}
+
// currentState assembles a State snapshot of the container: identity and
// init-process info, cgroup and Intel RDT paths, external descriptors, and
// per-type namespace paths (only when an init process is alive).
func (c *linuxContainer) currentState() (*State, error) {
	var (
		startTime           uint64
		externalDescriptors []string
		pid                 = -1
	)
	if c.initProcess != nil {
		pid = c.initProcess.pid()
		// startTime lookup is best-effort; the zero value is recorded on failure.
		startTime, _ = c.initProcess.startTime()
		externalDescriptors = c.initProcess.externalDescriptors()
	}
	// Intel RDT path is also best-effort: record an empty path on error.
	intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
	if err != nil {
		intelRdtPath = ""
	}
	state := &State{
		BaseState: BaseState{
			ID:                   c.ID(),
			Config:               *c.config,
			InitProcessPid:       pid,
			InitProcessStartTime: startTime,
			Created:              c.created,
		},
		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
		CgroupPaths:         c.cgroupManager.GetPaths(),
		IntelRdtPath:        intelRdtPath,
		NamespacePaths:      make(map[configs.NamespaceType]string),
		ExternalDescriptors: externalDescriptors,
	}
	if pid > 0 {
		// Record explicitly configured namespaces first (these may carry
		// custom paths) ...
		for _, ns := range c.config.Namespaces {
			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
		}
		// ... then fill in the default /proc/<pid>/ns/* path for every
		// other namespace type supported by this build.
		for _, nsType := range configs.NamespaceTypes() {
			if !configs.IsNamespaceSupported(nsType) {
				continue
			}
			if _, ok := state.NamespacePaths[nsType]; !ok {
				ns := configs.Namespace{Type: nsType}
				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
			}
		}
	}
	return state, nil
}
+
+func (c *linuxContainer) currentOCIState() (*specs.State, error) {
+       bundle, annotations := utils.Annotations(c.config.Labels)
+       state := &specs.State{
+               Version:     specs.Version,
+               ID:          c.ID(),
+               Bundle:      bundle,
+               Annotations: annotations,
+       }
+       status, err := c.currentStatus()
+       if err != nil {
+               return nil, err
+       }
+       state.Status = status.String()
+       if status != Stopped {
+               if c.initProcess != nil {
+                       state.Pid = c.initProcess.pid()
+               }
+       }
+       return state, nil
+}
+
+// orderNamespacePaths sorts namespace paths into a list of paths that we
+// can setns in order.
+func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
+       paths := []string{}
+       for _, ns := range configs.NamespaceTypes() {
+
+               // Remove namespaces that we don't need to join.
+               if !c.config.Namespaces.Contains(ns) {
+                       continue
+               }
+
+               if p, ok := namespaces[ns]; ok && p != "" {
+                       // check if the requested namespace is supported
+                       if !configs.IsNamespaceSupported(ns) {
+                               return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
+                       }
+                       // only set to join this namespace if it exists
+                       if _, err := os.Lstat(p); err != nil {
+                               return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
+                       }
+                       // do not allow namespace path with comma as we use it to separate
+                       // the namespace paths
+                       if strings.ContainsRune(p, ',') {
+                               return nil, newSystemError(fmt.Errorf("invalid path %s", p))
+                       }
+                       paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
+               }
+
+       }
+
+       return paths, nil
+}
+
+func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
+       data := bytes.NewBuffer(nil)
+       for _, im := range idMap {
+               line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
+               if _, err := data.WriteString(line); err != nil {
+                       return nil, err
+               }
+       }
+       return data.Bytes(), nil
+}
+
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
	// create the netlink message
	r := nl.NewNetlinkRequest(int(InitMsg), 0)

	// write cloneFlags
	r.AddData(&Int32msg{
		Type:  CloneFlagsAttr,
		Value: uint32(cloneFlags),
	})

	// write custom namespace paths; the list is comma-joined, and
	// orderNamespacePaths rejects any path containing a comma.
	if len(nsMaps) > 0 {
		nsPaths, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		r.AddData(&Bytemsg{
			Type:  NsPathsAttr,
			Value: []byte(strings.Join(nsPaths, ",")),
		})
	}

	// write namespace paths only when we are not joining an existing user ns
	_, joinExistingUser := nsMaps[configs.NEWUSER]
	if !joinExistingUser {
		// write uid mappings
		if len(c.config.UidMappings) > 0 {
			if c.config.RootlessEUID && c.newuidmapPath != "" {
				// rootless: tell the bootstrap program which
				// newuidmap binary to use for writing the map
				r.AddData(&Bytemsg{
					Type:  UidmapPathAttr,
					Value: []byte(c.newuidmapPath),
				})
			}
			b, err := encodeIDMapping(c.config.UidMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  UidmapAttr,
				Value: b,
			})
		}

		// write gid mappings
		if len(c.config.GidMappings) > 0 {
			b, err := encodeIDMapping(c.config.GidMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  GidmapAttr,
				Value: b,
			})
			if c.config.RootlessEUID && c.newgidmapPath != "" {
				r.AddData(&Bytemsg{
					Type:  GidmapPathAttr,
					Value: []byte(c.newgidmapPath),
				})
			}
			// NOTE(review): presumably this enables setgroups in the
			// child when the gid map can be written with privilege —
			// confirm against the nsenter attribute handling.
			if requiresRootOrMappingTool(c.config) {
				r.AddData(&Boolmsg{
					Type:  SetgroupAttr,
					Value: true,
				})
			}
		}
	}

	if c.config.OomScoreAdj != nil {
		// write oom_score_adj
		r.AddData(&Bytemsg{
			Type:  OomScoreAdjAttr,
			Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
		})
	}

	// write rootless
	r.AddData(&Boolmsg{
		Type:  RootlessEUIDAttr,
		Value: c.config.RootlessEUID,
	})

	return bytes.NewReader(r.Serialize()), nil
}
+
+// ignoreTerminateErrors returns nil if the given err matches an error known
+// to indicate that the terminate occurred successfully or err was nil, otherwise
+// err is returned unaltered.
+func ignoreTerminateErrors(err error) error {
+       if err == nil {
+               return nil
+       }
+       s := err.Error()
+       switch {
+       case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"):
+               return nil
+       }
+       return err
+}
+
+func requiresRootOrMappingTool(c *configs.Config) bool {
+       gidMap := []configs.IDMap{
+               {ContainerID: 0, HostID: os.Getegid(), Size: 1},
+       }
+       return !reflect.DeepEqual(c.GidMappings, gidMap)
+}
diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go
new file mode 100644 (file)
index 0000000..f8af05d
--- /dev/null
@@ -0,0 +1,372 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "io/ioutil"
+       "os"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/libcontainer/system"
+)
+
// mockCgroupManager is a test double for the cgroup manager interface;
// each accessor simply returns the canned value stored in its fields.
type mockCgroupManager struct {
	pids    []int             // returned by GetPids
	allPids []int             // returned by GetAllPids
	stats   *cgroups.Stats    // returned by GetStats
	paths   map[string]string // returned by GetPaths
}
+
// mockIntelRdtManager is a test double for the Intel RDT manager interface
// returning canned stats and path values.
type mockIntelRdtManager struct {
	stats *intelrdt.Stats // returned by GetStats
	path  string          // returned by GetPath
}
+
// GetPids returns the canned pid list.
func (m *mockCgroupManager) GetPids() ([]int, error) {
	return m.pids, nil
}

// GetAllPids returns the canned recursive pid list.
func (m *mockCgroupManager) GetAllPids() ([]int, error) {
	return m.allPids, nil
}

// GetStats returns the canned stats object.
func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
	return m.stats, nil
}

// Apply is a no-op.
func (m *mockCgroupManager) Apply(pid int) error {
	return nil
}

// Set is a no-op.
func (m *mockCgroupManager) Set(container *configs.Config) error {
	return nil
}

// Destroy is a no-op.
func (m *mockCgroupManager) Destroy() error {
	return nil
}

// GetPaths returns the canned per-subsystem path map.
func (m *mockCgroupManager) GetPaths() map[string]string {
	return m.paths
}

// GetUnifiedPath is not supported by the mock.
func (m *mockCgroupManager) GetUnifiedPath() (string, error) {
	return "", fmt.Errorf("unimplemented")
}

// Freeze is a no-op.
func (m *mockCgroupManager) Freeze(state configs.FreezerState) error {
	return nil
}

// GetCgroups returns nil; the mock carries no cgroup configuration.
func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) {
	return nil, nil
}
+
// Apply is a no-op.
func (m *mockIntelRdtManager) Apply(pid int) error {
	return nil
}

// GetStats returns the canned Intel RDT stats object.
func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) {
	return m.stats, nil
}

// Destroy is a no-op.
func (m *mockIntelRdtManager) Destroy() error {
	return nil
}

// GetPath returns the canned resctrl path.
func (m *mockIntelRdtManager) GetPath() string {
	return m.path
}

// Set is a no-op.
func (m *mockIntelRdtManager) Set(container *configs.Config) error {
	return nil
}

// GetCgroups returns nil; the mock carries no cgroup configuration.
func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) {
	return nil, nil
}
+
// mockProcess is a test double for the container init process, backed by
// fixed values.
type mockProcess struct {
	_pid    int    // returned by pid()
	started uint64 // returned by startTime()
}
+
// terminate is a no-op.
func (m *mockProcess) terminate() error {
	return nil
}

// pid returns the canned pid.
func (m *mockProcess) pid() int {
	return m._pid
}

// startTime returns the canned start time.
func (m *mockProcess) startTime() (uint64, error) {
	return m.started, nil
}

// start is a no-op.
func (m *mockProcess) start() error {
	return nil
}

// wait returns no process state; the mock never runs anything.
func (m *mockProcess) wait() (*os.ProcessState, error) {
	return nil, nil
}

// signal is a no-op.
func (m *mockProcess) signal(_ os.Signal) error {
	return nil
}

// externalDescriptors returns an empty descriptor list.
func (m *mockProcess) externalDescriptors() []string {
	return []string{}
}

// setExternalDescriptors is a no-op.
func (m *mockProcess) setExternalDescriptors(newFds []string) {
}

// forwardChildLogs is a no-op.
func (m *mockProcess) forwardChildLogs() {
}
+
+func TestGetContainerPids(t *testing.T) {
+       container := &linuxContainer{
+               id:            "myid",
+               config:        &configs.Config{},
+               cgroupManager: &mockCgroupManager{allPids: []int{1, 2, 3}},
+       }
+       pids, err := container.Processes()
+       if err != nil {
+               t.Fatal(err)
+       }
+       for i, expected := range []int{1, 2, 3} {
+               if pids[i] != expected {
+                       t.Fatalf("expected pid %d but received %d", expected, pids[i])
+               }
+       }
+}
+
// TestGetContainerStats verifies that Stats() surfaces the cgroup memory
// usage and, when the host supports them, the Intel RDT schemata supplied
// by the managers.
func TestGetContainerStats(t *testing.T) {
	container := &linuxContainer{
		id:     "myid",
		config: &configs.Config{},
		cgroupManager: &mockCgroupManager{
			pids: []int{1, 2, 3},
			stats: &cgroups.Stats{
				MemoryStats: cgroups.MemoryStats{
					Usage: cgroups.MemoryData{
						Usage: 1024,
					},
				},
			},
		},
		intelRdtManager: &mockIntelRdtManager{
			stats: &intelrdt.Stats{
				L3CacheSchema: "L3:0=f;1=f0",
				MemBwSchema:   "MB:0=20;1=70",
			},
		},
	}
	stats, err := container.Stats()
	if err != nil {
		t.Fatal(err)
	}
	if stats.CgroupStats == nil {
		t.Fatal("cgroup stats are nil")
	}
	if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
		t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
	}
	// RDT assertions are gated on hardware capability, so this part of the
	// test only runs on hosts with CAT/MBA enabled.
	if intelrdt.IsCatEnabled() {
		if stats.IntelRdtStats == nil {
			t.Fatal("intel rdt stats are nil")
		}
		if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" {
			t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema)
		}
	}
	if intelrdt.IsMbaEnabled() {
		if stats.IntelRdtStats == nil {
			t.Fatal("intel rdt stats are nil")
		}
		if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" {
			t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema)
		}
	}
}
+
// TestGetContainerState verifies that State() reports the init process
// info, cgroup paths, Intel RDT path, and a namespace path for every
// configured namespace — honoring an explicitly configured path (NEWNET
// here) and defaulting the rest to /proc/<pid>/ns/<file>.
func TestGetContainerState(t *testing.T) {
	var (
		pid                  = os.Getpid()
		expectedMemoryPath   = "/sys/fs/cgroup/memory/myid"
		expectedNetworkPath  = fmt.Sprintf("/proc/%d/ns/net", pid)
		expectedIntelRdtPath = "/sys/fs/resctrl/myid"
	)
	container := &linuxContainer{
		id: "myid",
		config: &configs.Config{
			Namespaces: []configs.Namespace{
				{Type: configs.NEWPID},
				{Type: configs.NEWNS},
				{Type: configs.NEWNET, Path: expectedNetworkPath},
				{Type: configs.NEWUTS},
				// emulate host for IPC
				//{Type: configs.NEWIPC},
				{Type: configs.NEWCGROUP},
			},
		},
		initProcess: &mockProcess{
			_pid:    pid,
			started: 10,
		},
		cgroupManager: &mockCgroupManager{
			pids: []int{1, 2, 3},
			stats: &cgroups.Stats{
				MemoryStats: cgroups.MemoryStats{
					Usage: cgroups.MemoryData{
						Usage: 1024,
					},
				},
			},
			paths: map[string]string{
				"memory": expectedMemoryPath,
			},
		},
		intelRdtManager: &mockIntelRdtManager{
			stats: &intelrdt.Stats{
				L3CacheSchema: "L3:0=f0;1=f",
				MemBwSchema:   "MB:0=70;1=20",
			},
			path: expectedIntelRdtPath,
		},
	}
	container.state = &createdState{c: container}
	state, err := container.State()
	if err != nil {
		t.Fatal(err)
	}
	if state.InitProcessPid != pid {
		t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
	}
	if state.InitProcessStartTime != 10 {
		t.Fatalf("expected process start time 10 but received %d", state.InitProcessStartTime)
	}
	paths := state.CgroupPaths
	if paths == nil {
		t.Fatal("cgroup paths should not be nil")
	}
	if memPath := paths["memory"]; memPath != expectedMemoryPath {
		t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
	}
	// The RDT path is only populated on hosts with CAT/MBA support.
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		intelRdtPath := state.IntelRdtPath
		if intelRdtPath == "" {
			t.Fatal("intel rdt path should not be empty")
		}
		if intelRdtPath != expectedIntelRdtPath {
			t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath)
		}
	}
	for _, ns := range container.config.Namespaces {
		path := state.NamespacePaths[ns.Type]
		if path == "" {
			t.Fatalf("expected non nil namespace path for %s", ns.Type)
		}
		if ns.Type == configs.NEWNET {
			// NEWNET carries an explicit path and must be reported verbatim.
			if path != expectedNetworkPath {
				t.Fatalf("expected path %q but received %q", expectedNetworkPath, path)
			}
		} else {
			file := ""
			switch ns.Type {
			// NOTE(review): NEWNET is handled by the branch above, so
			// this case is unreachable here.
			case configs.NEWNET:
				file = "net"
			case configs.NEWNS:
				file = "mnt"
			case configs.NEWPID:
				file = "pid"
			case configs.NEWIPC:
				file = "ipc"
			case configs.NEWUSER:
				file = "user"
			case configs.NEWUTS:
				file = "uts"
			case configs.NEWCGROUP:
				file = "cgroup"
			}
			expected := fmt.Sprintf("/proc/%d/ns/%s", pid, file)
			if expected != path {
				t.Fatalf("expected path %q but received %q", expected, path)
			}
		}
	}
}
+
+func TestGetContainerStateAfterUpdate(t *testing.T) {
+       var (
+               pid = os.Getpid()
+       )
+       stat, err := system.Stat(pid)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       rootDir, err := ioutil.TempDir("", "TestGetContainerStateAfterUpdate")
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer os.RemoveAll(rootDir)
+
+       container := &linuxContainer{
+               root: rootDir,
+               id:   "myid",
+               config: &configs.Config{
+                       Namespaces: []configs.Namespace{
+                               {Type: configs.NEWPID},
+                               {Type: configs.NEWNS},
+                               {Type: configs.NEWNET},
+                               {Type: configs.NEWUTS},
+                               {Type: configs.NEWIPC},
+                       },
+                       Cgroups: &configs.Cgroup{
+                               Resources: &configs.Resources{
+                                       Memory: 1024,
+                               },
+                       },
+               },
+               initProcess: &mockProcess{
+                       _pid:    pid,
+                       started: stat.StartTime,
+               },
+               cgroupManager: &mockCgroupManager{},
+       }
+       container.state = &createdState{c: container}
+       state, err := container.State()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if state.InitProcessPid != pid {
+               t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
+       }
+       if state.InitProcessStartTime != stat.StartTime {
+               t.Fatalf("expected process start time %d but received %d", stat.StartTime, state.InitProcessStartTime)
+       }
+       if state.Config.Cgroups.Resources.Memory != 1024 {
+               t.Fatalf("expected Memory to be 1024 but received %q", state.Config.Cgroups.Memory)
+       }
+
+       // Set initProcessStartTime so we fake to be running
+       container.initProcessStartTime = state.InitProcessStartTime
+       container.state = &runningState{c: container}
+       newConfig := container.Config()
+       newConfig.Cgroups.Resources.Memory = 2048
+       if err := container.Set(newConfig); err != nil {
+               t.Fatal(err)
+       }
+       state, err = container.State()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if state.Config.Cgroups.Resources.Memory != 2048 {
+               t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory)
+       }
+}
diff --git a/libcontainer/criu_opts_linux.go b/libcontainer/criu_opts_linux.go
new file mode 100644 (file)
index 0000000..a2e344f
--- /dev/null
@@ -0,0 +1,40 @@
+package libcontainer
+
// cgMode is the cgroup restoring strategy provided by criu.
type cgMode uint32

const (
	// NOTE(review): the 3 + iota offset appears chosen to line up with
	// CRIU's rpc cg_mode enum values — confirm against criurpc before
	// reordering or adding modes.
	CRIU_CG_MODE_SOFT    cgMode = 3 + iota // restore cgroup properties if only dir created by criu
	CRIU_CG_MODE_FULL                      // always restore all cgroups and their properties
	CRIU_CG_MODE_STRICT                    // restore all, requiring them to not present in the system
	CRIU_CG_MODE_DEFAULT                   // the same as CRIU_CG_MODE_SOFT
)
+
// CriuPageServerInfo describes the CRIU page server that memory pages are
// dumped to.
type CriuPageServerInfo struct {
	Address string // IP address of CRIU page server
	Port    int32  // port number of CRIU page server
}
+
// VethPairName names the two ends of a veth pair handed to CRIU on restore.
type VethPairName struct {
	ContainerInterfaceName string // interface name inside the container
	HostInterfaceName      string // corresponding interface name on the host
}
+
// CriuOpts carries the per-operation options passed to CRIU for
// checkpoint, pre-dump, and restore.
type CriuOpts struct {
	ImagesDirectory         string             // directory for storing image files
	WorkDirectory           string             // directory to cd and write logs/pidfiles/stats to
	ParentImage             string             // directory for storing parent image files in pre-dump and dump
	LeaveRunning            bool               // leave container in running state after checkpoint
	TcpEstablished          bool               // checkpoint/restore established TCP connections
	ExternalUnixConnections bool               // allow external unix connections
	ShellJob                bool               // allow to dump and restore shell jobs
	FileLocks               bool               // handle file locks, for safety
	PreDump                 bool               // call criu predump to perform iterative checkpoint
	PageServer              CriuPageServerInfo // allow to dump to criu page server
	VethPairs               []VethPairName     // pass the veth to criu when restore
	ManageCgroupsMode       cgMode             // dump or restore cgroup mode
	EmptyNs                 uint32             // don't c/r properties for namespace from this mask
	AutoDedup               bool               // auto deduplication for incremental dumps
	LazyPages               bool               // restore memory pages lazily using userfaultfd
	StatusFd                string             // fd for feedback when lazy server is ready
}
diff --git a/libcontainer/devices/devices.go b/libcontainer/devices/devices.go
new file mode 100644 (file)
index 0000000..5dabe06
--- /dev/null
@@ -0,0 +1,110 @@
+package devices
+
+import (
+       "errors"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "golang.org/x/sys/unix"
+)
+
+var (
+       // ErrNotADevice denotes that a file is not a valid linux device.
+       ErrNotADevice = errors.New("not a device node")
+)
+
+// Testing dependencies
+var (
+       unixLstat     = unix.Lstat
+       ioutilReadDir = ioutil.ReadDir
+)
+
+// Given the path to a device and its cgroup_permissions (which cannot be easily queried), look up the
+// information about a linux device and return that information as a Device struct.
+func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+       var stat unix.Stat_t
+       err := unixLstat(path, &stat)
+       if err != nil {
+               return nil, err
+       }
+
+       var (
+               devNumber = uint64(stat.Rdev)
+               major     = unix.Major(devNumber)
+               minor     = unix.Minor(devNumber)
+       )
+       if major == 0 {
+               return nil, ErrNotADevice
+       }
+
+       var (
+               devType rune
+               mode    = stat.Mode
+       )
+       switch {
+       case mode&unix.S_IFBLK == unix.S_IFBLK:
+               devType = 'b'
+       case mode&unix.S_IFCHR == unix.S_IFCHR:
+               devType = 'c'
+       }
+       return &configs.Device{
+               Type:        devType,
+               Path:        path,
+               Major:       int64(major),
+               Minor:       int64(minor),
+               Permissions: permissions,
+               FileMode:    os.FileMode(mode),
+               Uid:         stat.Uid,
+               Gid:         stat.Gid,
+       }, nil
+}
+
+// HostDevices returns all devices that can be found under /dev directory.
+func HostDevices() ([]*configs.Device, error) {
+       return GetDevices("/dev")
+}
+
+// GetDevices recursively traverses a directory specified by path
+// and returns all devices found there.
+func GetDevices(path string) ([]*configs.Device, error) {
+       files, err := ioutilReadDir(path)
+       if err != nil {
+               return nil, err
+       }
+       var out []*configs.Device
+       for _, f := range files {
+               switch {
+               case f.IsDir():
+                       switch f.Name() {
+                       // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
+                       // ".udev" added to address https://github.com/opencontainers/runc/issues/2093
+                       case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
+                               continue
+                       default:
+                               sub, err := GetDevices(filepath.Join(path, f.Name()))
+                               if err != nil {
+                                       return nil, err
+                               }
+
+                               out = append(out, sub...)
+                               continue
+                       }
+               case f.Name() == "console":
+                       continue
+               }
+               device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
+               if err != nil {
+                       if err == ErrNotADevice {
+                               continue
+                       }
+                       if os.IsNotExist(err) {
+                               continue
+                       }
+                       return nil, err
+               }
+               out = append(out, device)
+       }
+       return out, nil
+}
diff --git a/libcontainer/devices/devices_test.go b/libcontainer/devices/devices_test.go
new file mode 100644 (file)
index 0000000..0afa9d9
--- /dev/null
@@ -0,0 +1,63 @@
+package devices
+
+import (
+       "errors"
+       "os"
+       "testing"
+
+       "golang.org/x/sys/unix"
+)
+
+func TestDeviceFromPathLstatFailure(t *testing.T) {
+       testError := errors.New("test error")
+
+       // Override unix.Lstat to inject error.
+       unixLstat = func(path string, stat *unix.Stat_t) error {
+               return testError
+       }
+
+       _, err := DeviceFromPath("", "")
+       if err != testError {
+               t.Fatalf("Unexpected error %v, expected %v", err, testError)
+       }
+}
+
+func TestHostDevicesIoutilReadDirFailure(t *testing.T) {
+       testError := errors.New("test error")
+
+       // Override ioutil.ReadDir to inject error.
+       ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
+               return nil, testError
+       }
+
+       _, err := HostDevices()
+       if err != testError {
+               t.Fatalf("Unexpected error %v, expected %v", err, testError)
+       }
+}
+
+func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
+       testError := errors.New("test error")
+       called := false
+
+       // Override ioutil.ReadDir to inject error after the first call.
+       ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
+               if called {
+                       return nil, testError
+               }
+               called = true
+
+               // Provoke a second call.
+               fi, err := os.Lstat("/tmp")
+               if err != nil {
+                       t.Fatalf("Unexpected error %v", err)
+               }
+
+               return []os.FileInfo{fi}, nil
+       }
+
+       _, err := HostDevices()
+       if err != testError {
+               t.Fatalf("Unexpected error %v, expected %v", err, testError)
+       }
+}
diff --git a/libcontainer/error.go b/libcontainer/error.go
new file mode 100644 (file)
index 0000000..21a3789
--- /dev/null
@@ -0,0 +1,70 @@
+package libcontainer
+
+import "io"
+
+// ErrorCode is the API error code type.
+type ErrorCode int
+
+// API error codes.
+const (
+       // Factory errors
+       IdInUse ErrorCode = iota
+       InvalidIdFormat
+
+       // Container errors
+       ContainerNotExists
+       ContainerPaused
+       ContainerNotStopped
+       ContainerNotRunning
+       ContainerNotPaused
+
+       // Process errors
+       NoProcessOps
+
+       // Common errors
+       ConfigInvalid
+       ConsoleExists
+       SystemError
+)
+
+func (c ErrorCode) String() string {
+       switch c {
+       case IdInUse:
+               return "Id already in use"
+       case InvalidIdFormat:
+               return "Invalid format"
+       case ContainerPaused:
+               return "Container paused"
+       case ConfigInvalid:
+               return "Invalid configuration"
+       case SystemError:
+               return "System error"
+       case ContainerNotExists:
+               return "Container does not exist"
+       case ContainerNotStopped:
+               return "Container is not stopped"
+       case ContainerNotRunning:
+               return "Container is not running"
+       case ConsoleExists:
+               return "Console exists for process"
+       case ContainerNotPaused:
+               return "Container is not paused"
+       case NoProcessOps:
+               return "No process operations"
+       default:
+               return "Unknown error"
+       }
+}
+
+// Error is the API error type.
+type Error interface {
+       error
+
+       // Returns an error if it failed to write the detail of the Error to w.
+       // The detail of the Error may include the error message and a
+       // representation of the stack trace.
+       Detail(w io.Writer) error
+
+       // Returns the error code for this error.
+       Code() ErrorCode
+}
diff --git a/libcontainer/error_test.go b/libcontainer/error_test.go
new file mode 100644 (file)
index 0000000..36841ad
--- /dev/null
@@ -0,0 +1,25 @@
+package libcontainer
+
+import "testing"
+
+func TestErrorCode(t *testing.T) {
+       codes := map[ErrorCode]string{
+               IdInUse:             "Id already in use",
+               InvalidIdFormat:     "Invalid format",
+               ContainerPaused:     "Container paused",
+               ConfigInvalid:       "Invalid configuration",
+               SystemError:         "System error",
+               ContainerNotExists:  "Container does not exist",
+               ContainerNotStopped: "Container is not stopped",
+               ContainerNotRunning: "Container is not running",
+               ConsoleExists:       "Console exists for process",
+               ContainerNotPaused:  "Container is not paused",
+               NoProcessOps:        "No process operations",
+       }
+
+       for code, expected := range codes {
+               if actual := code.String(); actual != expected {
+                       t.Fatalf("expected string %q but received %q", expected, actual)
+               }
+       }
+}
diff --git a/libcontainer/factory.go b/libcontainer/factory.go
new file mode 100644 (file)
index 0000000..0986cd7
--- /dev/null
@@ -0,0 +1,44 @@
+package libcontainer
+
+import (
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Factory interface {
+       // Creates a new container with the given id and starts the initial process inside it.
+       // id must be a string containing only letters, digits and underscores and must contain
+       // between 1 and 1024 characters, inclusive.
+       //
+       // The id must not already be in use by an existing container. Containers created using
+       // a factory with the same path (and filesystem) must have distinct ids.
+       //
+       // Returns the new container with a running process.
+       //
+       // errors:
+       // IdInUse - id is already in use by a container
+       // InvalidIdFormat - id has incorrect format
+       // ConfigInvalid - config is invalid
+       // Systemerror - System error
+       //
+       // On error, any partially created container parts are cleaned up (the operation is atomic).
+       Create(id string, config *configs.Config) (Container, error)
+
+       // Load takes an ID for an existing container and returns the container information
+       // from the state.  This presents a read only view of the container.
+       //
+       // errors:
+       // Path does not exist
+       // System error
+       Load(id string) (Container, error)
+
+       // StartInitialization is an internal API to libcontainer used during the reexec of the
+       // container.
+       //
+       // Errors:
+       // Pipe connection error
+       // System error
+       StartInitialization() error
+
+       // Type returns info string about factory type (e.g. lxc, libcontainer...)
+       Type() string
+}
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
new file mode 100644 (file)
index 0000000..437633c
--- /dev/null
@@ -0,0 +1,427 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "path/filepath"
+       "regexp"
+       "runtime/debug"
+       "strconv"
+
+       "github.com/cyphar/filepath-securejoin"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+       "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
+       "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/configs/validate"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/libcontainer/mount"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/pkg/errors"
+
+       "golang.org/x/sys/unix"
+)
+
+const (
+       stateFilename    = "state.json"
+       execFifoFilename = "exec.fifo"
+)
+
+var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
+
+// InitArgs returns an options func to configure a LinuxFactory with the
+// provided init binary path and arguments.
+func InitArgs(args ...string) func(*LinuxFactory) error {
+       return func(l *LinuxFactory) (err error) {
+               if len(args) > 0 {
+                       // Resolve relative paths to ensure that it's available
+                       // after directory changes.
+                       if args[0], err = filepath.Abs(args[0]); err != nil {
+                               return newGenericError(err, ConfigInvalid)
+                       }
+               }
+
+               l.InitArgs = args
+               return nil
+       }
+}
+
+// SystemdCgroups is an options func to configure a LinuxFactory to return
+// containers that use systemd to create and manage cgroups.
+func SystemdCgroups(l *LinuxFactory) error {
+       systemdCgroupsManager, err := systemd.NewSystemdCgroupsManager()
+       if err != nil {
+               return err
+       }
+       l.NewCgroupsManager = systemdCgroupsManager
+       return nil
+}
+
+func getUnifiedPath(paths map[string]string) string {
+       unifiedPath := ""
+       for k, v := range paths {
+               if unifiedPath == "" {
+                       unifiedPath = v
+               } else if v != unifiedPath {
+                       panic(errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v))
+               }
+       }
+       // can be empty
+       return unifiedPath
+}
+
+func cgroupfs2(l *LinuxFactory, rootless bool) error {
+       l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+               m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless)
+               if err != nil {
+                       panic(err)
+               }
+               return m
+       }
+       return nil
+}
+
+// Cgroupfs is an options func to configure a LinuxFactory to return containers
+// that use the native cgroups filesystem implementation to create and manage
+// cgroups.
+func Cgroupfs(l *LinuxFactory) error {
+       if cgroups.IsCgroup2UnifiedMode() {
+               return cgroupfs2(l, false)
+       }
+       l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+               return &fs.Manager{
+                       Cgroups: config,
+                       Paths:   paths,
+               }
+       }
+       return nil
+}
+
+// RootlessCgroupfs is an options func to configure a LinuxFactory to return
+// containers that use the native cgroups filesystem implementation to create
+// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is
+// that RootlessCgroupfs can transparently handle permission errors that occur
+// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if
+// they've been set up properly).
+func RootlessCgroupfs(l *LinuxFactory) error {
+       if cgroups.IsCgroup2UnifiedMode() {
+               return cgroupfs2(l, true)
+       }
+       l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+               return &fs.Manager{
+                       Cgroups:  config,
+                       Rootless: true,
+                       Paths:    paths,
+               }
+       }
+       return nil
+}
+
+// IntelRdtFs is an options func to configure a LinuxFactory to return
+// containers that use the Intel RDT "resource control" filesystem to
+// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
+func IntelRdtFs(l *LinuxFactory) error {
+       l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
+               return &intelrdt.IntelRdtManager{
+                       Config: config,
+                       Id:     id,
+                       Path:   path,
+               }
+       }
+       return nil
+}
+
+// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
+func TmpfsRoot(l *LinuxFactory) error {
+       mounted, err := mount.Mounted(l.Root)
+       if err != nil {
+               return err
+       }
+       if !mounted {
+               if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+// CriuPath returns an option func to configure a LinuxFactory with the
+// provided criupath
+func CriuPath(criupath string) func(*LinuxFactory) error {
+       return func(l *LinuxFactory) error {
+               l.CriuPath = criupath
+               return nil
+       }
+}
+
+// New returns a linux based container factory based in the root directory and
+// configures the factory with the provided option funcs.
+func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
+       if root != "" {
+               if err := os.MkdirAll(root, 0700); err != nil {
+                       return nil, newGenericError(err, SystemError)
+               }
+       }
+       l := &LinuxFactory{
+               Root:      root,
+               InitPath:  "/proc/self/exe",
+               InitArgs:  []string{os.Args[0], "init"},
+               Validator: validate.New(),
+               CriuPath:  "criu",
+       }
+       Cgroupfs(l)
+       for _, opt := range options {
+               if opt == nil {
+                       continue
+               }
+               if err := opt(l); err != nil {
+                       return nil, err
+               }
+       }
+       return l, nil
+}
+
+// LinuxFactory implements the default factory interface for linux based systems.
+type LinuxFactory struct {
+       // Root directory for the factory to store state.
+       Root string
+
+       // InitPath is the path for calling the init responsibilities for spawning
+       // a container.
+       InitPath string
+
+       // InitArgs are arguments for calling the init responsibilities for spawning
+       // a container.
+       InitArgs []string
+
+       // CriuPath is the path to the criu binary used for checkpoint and restore of
+       // containers.
+       CriuPath string
+
+       // New{u,g}uidmapPath is the path to the binaries used for mapping with
+       // rootless containers.
+       NewuidmapPath string
+       NewgidmapPath string
+
+       // Validator provides validation to container configurations.
+       Validator validate.Validator
+
+       // NewCgroupsManager returns an initialized cgroups manager for a single container.
+       NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
+
+       // NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
+       NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
+}
+
+func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
+       if l.Root == "" {
+               return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
+       }
+       if err := l.validateID(id); err != nil {
+               return nil, err
+       }
+       if err := l.Validator.Validate(config); err != nil {
+               return nil, newGenericError(err, ConfigInvalid)
+       }
+       containerRoot, err := securejoin.SecureJoin(l.Root, id)
+       if err != nil {
+               return nil, err
+       }
+       if _, err := os.Stat(containerRoot); err == nil {
+               return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
+       } else if !os.IsNotExist(err) {
+               return nil, newGenericError(err, SystemError)
+       }
+       if err := os.MkdirAll(containerRoot, 0711); err != nil {
+               return nil, newGenericError(err, SystemError)
+       }
+       if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
+               return nil, newGenericError(err, SystemError)
+       }
+       c := &linuxContainer{
+               id:            id,
+               root:          containerRoot,
+               config:        config,
+               initPath:      l.InitPath,
+               initArgs:      l.InitArgs,
+               criuPath:      l.CriuPath,
+               newuidmapPath: l.NewuidmapPath,
+               newgidmapPath: l.NewgidmapPath,
+               cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
+       }
+       if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
+               c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
+       }
+       c.state = &stoppedState{c: c}
+       return c, nil
+}
+
+func (l *LinuxFactory) Load(id string) (Container, error) {
+       if l.Root == "" {
+               return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
+       }
+       // When loading, we need to check whether the id is valid.
+       if err := l.validateID(id); err != nil {
+               return nil, err
+       }
+       containerRoot, err := securejoin.SecureJoin(l.Root, id)
+       if err != nil {
+               return nil, err
+       }
+       state, err := l.loadState(containerRoot, id)
+       if err != nil {
+               return nil, err
+       }
+       r := &nonChildProcess{
+               processPid:       state.InitProcessPid,
+               processStartTime: state.InitProcessStartTime,
+               fds:              state.ExternalDescriptors,
+       }
+       c := &linuxContainer{
+               initProcess:          r,
+               initProcessStartTime: state.InitProcessStartTime,
+               id:                   id,
+               config:               &state.Config,
+               initPath:             l.InitPath,
+               initArgs:             l.InitArgs,
+               criuPath:             l.CriuPath,
+               newuidmapPath:        l.NewuidmapPath,
+               newgidmapPath:        l.NewgidmapPath,
+               cgroupManager:        l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
+               root:                 containerRoot,
+               created:              state.Created,
+       }
+       c.state = &loadedState{c: c}
+       if err := c.refreshState(); err != nil {
+               return nil, err
+       }
+       if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
+               c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
+       }
+       return c, nil
+}
+
+func (l *LinuxFactory) Type() string {
+       return "libcontainer"
+}
+
+// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
+// This is a low level implementation detail of the reexec and should not be consumed externally
+func (l *LinuxFactory) StartInitialization() (err error) {
+       var (
+               pipefd, fifofd int
+               consoleSocket  *os.File
+               envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
+               envFifoFd      = os.Getenv("_LIBCONTAINER_FIFOFD")
+               envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
+       )
+
+       // Get the INITPIPE.
+       pipefd, err = strconv.Atoi(envInitPipe)
+       if err != nil {
+               return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
+       }
+
+       var (
+               pipe = os.NewFile(uintptr(pipefd), "pipe")
+               it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
+       )
+       defer pipe.Close()
+
+       // Only init processes have FIFOFD.
+       fifofd = -1
+       if it == initStandard {
+               if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
+                       return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
+               }
+       }
+
+       if envConsole != "" {
+               console, err := strconv.Atoi(envConsole)
+               if err != nil {
+                       return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
+               }
+               consoleSocket = os.NewFile(uintptr(console), "console-socket")
+               defer consoleSocket.Close()
+       }
+
+       // clear the current process's environment to clean any libcontainer
+       // specific env vars.
+       os.Clearenv()
+
+       defer func() {
+               // We have an error during the initialization of the container's init,
+               // send it back to the parent process in the form of an initError.
+               if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
+                       fmt.Fprintln(os.Stderr, err)
+                       return
+               }
+               if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
+                       fmt.Fprintln(os.Stderr, err)
+                       return
+               }
+       }()
+       defer func() {
+               if e := recover(); e != nil {
+                       err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
+               }
+       }()
+
+       i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
+       if err != nil {
+               return err
+       }
+
+       // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
+       return i.Init()
+}
+
+func (l *LinuxFactory) loadState(root, id string) (*State, error) {
+       stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
+       if err != nil {
+               return nil, err
+       }
+       f, err := os.Open(stateFilePath)
+       if err != nil {
+               if os.IsNotExist(err) {
+                       return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
+               }
+               return nil, newGenericError(err, SystemError)
+       }
+       defer f.Close()
+       var state *State
+       if err := json.NewDecoder(f).Decode(&state); err != nil {
+               return nil, newGenericError(err, SystemError)
+       }
+       return state, nil
+}
+
+func (l *LinuxFactory) validateID(id string) error {
+       if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
+               return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
+       }
+
+       return nil
+}
+
+// NewuidmapPath returns an option func to configure a LinuxFactory with the
+// provided newuidmap binary path.
+func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
+       return func(l *LinuxFactory) error {
+               l.NewuidmapPath = newuidmapPath
+               return nil
+       }
+}
+
+// NewgidmapPath returns an option func to configure a LinuxFactory with the
+// provided newgidmap binary path.
+func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
+       return func(l *LinuxFactory) error {
+               l.NewgidmapPath = newgidmapPath
+               return nil
+       }
+}
diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
new file mode 100644 (file)
index 0000000..8d0ca8a
--- /dev/null
@@ -0,0 +1,235 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "reflect"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/mount"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "golang.org/x/sys/unix"
+)
+
+func newTestRoot() (string, error) {
+       dir, err := ioutil.TempDir("", "libcontainer")
+       if err != nil {
+               return "", err
+       }
+       return dir, nil
+}
+
+func TestFactoryNew(t *testing.T) {
+       root, rerr := newTestRoot()
+       if rerr != nil {
+               t.Fatal(rerr)
+       }
+       defer os.RemoveAll(root)
+       factory, err := New(root, Cgroupfs)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if factory == nil {
+               t.Fatal("factory should not be nil")
+       }
+       lfactory, ok := factory.(*LinuxFactory)
+       if !ok {
+               t.Fatal("expected linux factory returned on linux based systems")
+       }
+       if lfactory.Root != root {
+               t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
+       }
+
+       if factory.Type() != "libcontainer" {
+               t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
+       }
+}
+
+func TestFactoryNewIntelRdt(t *testing.T) {
+       root, rerr := newTestRoot()
+       if rerr != nil {
+               t.Fatal(rerr)
+       }
+       defer os.RemoveAll(root)
+       factory, err := New(root, Cgroupfs, IntelRdtFs)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if factory == nil {
+               t.Fatal("factory should not be nil")
+       }
+       lfactory, ok := factory.(*LinuxFactory)
+       if !ok {
+               t.Fatal("expected linux factory returned on linux based systems")
+       }
+       if lfactory.Root != root {
+               t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
+       }
+
+       if factory.Type() != "libcontainer" {
+               t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
+       }
+}
+
+func TestFactoryNewTmpfs(t *testing.T) {
+       root, rerr := newTestRoot()
+       if rerr != nil {
+               t.Fatal(rerr)
+       }
+       defer os.RemoveAll(root)
+       factory, err := New(root, Cgroupfs, TmpfsRoot)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if factory == nil {
+               t.Fatal("factory should not be nil")
+       }
+       lfactory, ok := factory.(*LinuxFactory)
+       if !ok {
+               t.Fatal("expected linux factory returned on linux based systems")
+       }
+       if lfactory.Root != root {
+               t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
+       }
+
+       if factory.Type() != "libcontainer" {
+               t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
+       }
+       mounted, err := mount.Mounted(lfactory.Root)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if !mounted {
+               t.Fatalf("Factory Root is not mounted")
+       }
+       mounts, err := mount.GetMounts()
+       if err != nil {
+               t.Fatal(err)
+       }
+       var found bool
+       for _, m := range mounts {
+               if m.Mountpoint == lfactory.Root {
+                       if m.Fstype != "tmpfs" {
+                               t.Fatalf("Fstype of root: %s, expected %s", m.Fstype, "tmpfs")
+                       }
+                       if m.Source != "tmpfs" {
+                               t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs")
+                       }
+                       found = true
+               }
+       }
+       if !found {
+               t.Fatalf("Factory Root is not listed in mounts list")
+       }
+       defer unix.Unmount(root, unix.MNT_DETACH)
+}
+
+func TestFactoryLoadNotExists(t *testing.T) {
+       root, rerr := newTestRoot()
+       if rerr != nil {
+               t.Fatal(rerr)
+       }
+       defer os.RemoveAll(root)
+       factory, err := New(root, Cgroupfs)
+       if err != nil {
+               t.Fatal(err)
+       }
+       _, err = factory.Load("nocontainer")
+       if err == nil {
+               t.Fatal("expected nil error loading non-existing container")
+       }
+       lerr, ok := err.(Error)
+       if !ok {
+               t.Fatal("expected libcontainer error type")
+       }
+       if lerr.Code() != ContainerNotExists {
+               t.Fatalf("expected error code %s but received %s", ContainerNotExists, lerr.Code())
+       }
+}
+
+func TestFactoryLoadContainer(t *testing.T) {
+       root, err := newTestRoot()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer os.RemoveAll(root)
+       // setup default container config and state for mocking
+       var (
+               id            = "1"
+               expectedHooks = &configs.Hooks{
+                       Prestart: []configs.Hook{
+                               configs.CommandHook{Command: configs.Command{Path: "prestart-hook"}},
+                       },
+                       Poststart: []configs.Hook{
+                               configs.CommandHook{Command: configs.Command{Path: "poststart-hook"}},
+                       },
+                       Poststop: []configs.Hook{
+                               unserializableHook{},
+                               configs.CommandHook{Command: configs.Command{Path: "poststop-hook"}},
+                       },
+               }
+               expectedConfig = &configs.Config{
+                       Rootfs: "/mycontainer/root",
+                       Hooks:  expectedHooks,
+               }
+               expectedState = &State{
+                       BaseState: BaseState{
+                               InitProcessPid: 1024,
+                               Config:         *expectedConfig,
+                       },
+               }
+       )
+       if err := os.Mkdir(filepath.Join(root, id), 0700); err != nil {
+               t.Fatal(err)
+       }
+       if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil {
+               t.Fatal(err)
+       }
+       factory, err := New(root, Cgroupfs, IntelRdtFs)
+       if err != nil {
+               t.Fatal(err)
+       }
+       container, err := factory.Load(id)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if container.ID() != id {
+               t.Fatalf("expected container id %q but received %q", id, container.ID())
+       }
+       config := container.Config()
+       if config.Rootfs != expectedConfig.Rootfs {
+               t.Fatalf("expected rootfs %q but received %q", expectedConfig.Rootfs, config.Rootfs)
+       }
+       expectedHooks.Poststop = expectedHooks.Poststop[1:] // expect unserializable hook to be skipped
+       if !reflect.DeepEqual(config.Hooks, expectedHooks) {
+               t.Fatalf("expects hooks %q but received %q", expectedHooks, config.Hooks)
+       }
+       lcontainer, ok := container.(*linuxContainer)
+       if !ok {
+               t.Fatal("expected linux container on linux based systems")
+       }
+       if lcontainer.initProcess.pid() != expectedState.InitProcessPid {
+               t.Fatalf("expected init pid %d but received %d", expectedState.InitProcessPid, lcontainer.initProcess.pid())
+       }
+}
+
+func marshal(path string, v interface{}) error {
+       f, err := os.Create(path)
+       if err != nil {
+               return err
+       }
+       defer f.Close()
+       return utils.WriteJSON(f, v)
+}
+
+type unserializableHook struct{}
+
+func (unserializableHook) Run(*specs.State) error {
+       return nil
+}
diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go
new file mode 100644 (file)
index 0000000..6e7de2f
--- /dev/null
@@ -0,0 +1,92 @@
+package libcontainer
+
+import (
+       "fmt"
+       "io"
+       "text/template"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer/stacktrace"
+)
+
// errorTemplate renders a genericError for Detail(): timestamp, error code,
// optional message, then one "---" separated entry per captured stack frame.
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
Message: {{.Message}}
{{end}}
Frames:{{range $i, $frame := .Stack.Frames}}
---
{{$i}}: {{$frame.Function}}
Package: {{$frame.Package}}
File: {{$frame.File}}@{{$frame.Line}}{{end}}
`))
+
+func newGenericError(err error, c ErrorCode) Error {
+       if le, ok := err.(Error); ok {
+               return le
+       }
+       gerr := &genericError{
+               Timestamp: time.Now(),
+               Err:       err,
+               ECode:     c,
+               Stack:     stacktrace.Capture(1),
+       }
+       if err != nil {
+               gerr.Message = err.Error()
+       }
+       return gerr
+}
+
// newSystemError returns a SystemError wrapping err with no cause string.
func newSystemError(err error) Error {
	return createSystemError(err, "")
}

// newSystemErrorWithCausef is newSystemErrorWithCause with a
// printf-formatted cause string.
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
	return createSystemError(err, fmt.Sprintf(cause, v...))
}

// newSystemErrorWithCause returns a SystemError wrapping err, annotated with
// a short description of the operation that produced it.
func newSystemErrorWithCause(err error, cause string) Error {
	return createSystemError(err, cause)
}
+
+// createSystemError creates the specified error with the correct number of
+// stack frames skipped. This is only to be called by the other functions for
+// formatting the error.
+func createSystemError(err error, cause string) Error {
+       gerr := &genericError{
+               Timestamp: time.Now(),
+               Err:       err,
+               ECode:     SystemError,
+               Cause:     cause,
+               Stack:     stacktrace.Capture(2),
+       }
+       if err != nil {
+               gerr.Message = err.Error()
+       }
+       return gerr
+}
+
// genericError is the concrete Error implementation used by libcontainer.
// Err is excluded from JSON (error values do not marshal usefully); Message
// preserves its text for serialization instead.
type genericError struct {
	Timestamp time.Time
	ECode     ErrorCode
	Err       error `json:"-"`
	Cause     string
	Message   string
	Stack     stacktrace.Stacktrace
}
+
+func (e *genericError) Error() string {
+       if e.Cause == "" {
+               return e.Message
+       }
+       frame := e.Stack.Frames[0]
+       return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
+}
+
// Code returns the ErrorCode assigned when the error was created.
func (e *genericError) Code() ErrorCode {
	return e.ECode
}

// Detail writes a full human-readable report of the error (timestamp, code,
// message and stack frames) to w using errorTemplate.
func (e *genericError) Detail(w io.Writer) error {
	return errorTemplate.Execute(w, e)
}
diff --git a/libcontainer/generic_error_test.go b/libcontainer/generic_error_test.go
new file mode 100644 (file)
index 0000000..8fbdd4d
--- /dev/null
@@ -0,0 +1,49 @@
+package libcontainer
+
+import (
+       "fmt"
+       "io/ioutil"
+       "testing"
+)
+
+func TestErrorDetail(t *testing.T) {
+       err := newGenericError(fmt.Errorf("test error"), SystemError)
+       if derr := err.Detail(ioutil.Discard); derr != nil {
+               t.Fatal(derr)
+       }
+}
+
+func TestErrorWithCode(t *testing.T) {
+       err := newGenericError(fmt.Errorf("test error"), SystemError)
+       if code := err.Code(); code != SystemError {
+               t.Fatalf("expected err code %q but %q", SystemError, code)
+       }
+}
+
+func TestErrorWithError(t *testing.T) {
+       cc := []struct {
+               errmsg string
+               cause  string
+       }{
+               {
+                       errmsg: "test error",
+               },
+               {
+                       errmsg: "test error",
+                       cause:  "test",
+               },
+       }
+
+       for _, v := range cc {
+               err := newSystemErrorWithCause(fmt.Errorf(v.errmsg), v.cause)
+
+               msg := err.Error()
+               if v.cause == "" && msg != v.errmsg {
+                       t.Fatalf("expected err(%q) equal errmsg(%q)", msg, v.errmsg)
+               }
+               if v.cause != "" && msg == v.errmsg {
+                       t.Fatalf("unexpected err(%q) equal errmsg(%q)", msg, v.errmsg)
+               }
+
+       }
+}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
new file mode 100644 (file)
index 0000000..c1b1560
--- /dev/null
@@ -0,0 +1,537 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "encoding/json"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net"
+       "os"
+       "strings"
+       "syscall" // only for Errno
+       "unsafe"
+
+       "golang.org/x/sys/unix"
+
+       "github.com/containerd/console"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/runc/libcontainer/user"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/pkg/errors"
+       "github.com/sirupsen/logrus"
+       "github.com/vishvananda/netlink"
+)
+
// initType selects which in-container init path runs: joining an existing
// container's namespaces ("setns") or bootstrapping a new container
// ("standard").
type initType string

const (
	initSetns    initType = "setns"
	initStandard initType = "standard"
)

// pid carries process IDs reported back to the parent as JSON.
// PidFirstChild is presumably the intermediate child of a two-step clone —
// confirm against the nsexec/bootstrap path.
type pid struct {
	Pid           int `json:"pid"`
	PidFirstChild int `json:"pid_first"`
}

// network is an internal struct used to setup container networks.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}
+
// initConfig is used for transferring parameters from Exec() to Init().
// It is serialized as JSON over the init pipe; see newContainerInit.
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerId      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
}

// initer is implemented by the per-initType init implementations
// (the setns and standard variants constructed in newContainerInit).
type initer interface {
	Init() error
}
+
+func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
+       var config *initConfig
+       if err := json.NewDecoder(pipe).Decode(&config); err != nil {
+               return nil, err
+       }
+       if err := populateProcessEnvironment(config.Env); err != nil {
+               return nil, err
+       }
+       switch t {
+       case initSetns:
+               return &linuxSetnsInit{
+                       pipe:          pipe,
+                       consoleSocket: consoleSocket,
+                       config:        config,
+               }, nil
+       case initStandard:
+               return &linuxStandardInit{
+                       pipe:          pipe,
+                       consoleSocket: consoleSocket,
+                       parentPid:     unix.Getppid(),
+                       config:        config,
+                       fifoFd:        fifoFd,
+               }, nil
+       }
+       return nil, fmt.Errorf("unknown init type %q", t)
+}
+
+// populateProcessEnvironment loads the provided environment variables into the
+// current processes's environment.
+func populateProcessEnvironment(env []string) error {
+       for _, pair := range env {
+               p := strings.SplitN(pair, "=", 2)
+               if len(p) < 2 {
+                       return fmt.Errorf("invalid environment '%v'", pair)
+               }
+               if err := os.Setenv(p[0], p[1]); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace.
//
// The ordering below matters: the bounding set is shrunk before switching
// users, keep-caps brackets the user switch so capabilities survive it, and
// the final cap set is applied only after the switch completes.
func finalizeNamespace(config *initConfig) error {
	// Ensure that all unwanted fds we may have accidentally
	// inherited are marked close-on-exec so they stay out of the
	// container
	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
		return errors.Wrap(err, "close exec fds")
	}

	if config.Cwd != "" {
		if err := unix.Chdir(config.Cwd); err != nil {
			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
		}
	}

	// Process-level capabilities take precedence over the container-wide
	// set; an empty struct (drop everything not defaulted) is the fallback.
	capabilities := &configs.Capabilities{}
	if config.Capabilities != nil {
		capabilities = config.Capabilities
	} else if config.Config.Capabilities != nil {
		capabilities = config.Config.Capabilities
	}
	w, err := newContainerCapList(capabilities)
	if err != nil {
		return err
	}
	// drop capabilities in bounding set before changing user
	if err := w.ApplyBoundingSet(); err != nil {
		return errors.Wrap(err, "apply bounding set")
	}
	// preserve existing capabilities while we change users
	if err := system.SetKeepCaps(); err != nil {
		return errors.Wrap(err, "set keep caps")
	}
	if err := setupUser(config); err != nil {
		return errors.Wrap(err, "setup user")
	}
	if err := system.ClearKeepCaps(); err != nil {
		return errors.Wrap(err, "clear keep caps")
	}
	if err := w.ApplyCaps(); err != nil {
		return errors.Wrap(err, "apply caps")
	}
	return nil
}
+
+// setupConsole sets up the console from inside the container, and sends the
+// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
+// consoles are scoped to a container properly (see runc#814 and the many
+// issues related to that). This has to be run *after* we've pivoted to the new
+// rootfs (and the users' configuration is entirely set up).
+func setupConsole(socket *os.File, config *initConfig, mount bool) error {
+       defer socket.Close()
+       // At this point, /dev/ptmx points to something that we would expect. We
+       // used to change the owner of the slave path, but since the /dev/pts mount
+       // can have gid=X set (at the users' option). So touching the owner of the
+       // slave PTY is not necessary, as the kernel will handle that for us. Note
+       // however, that setupUser (specifically fixStdioPermissions) *will* change
+       // the UID owner of the console to be the user the process will run as (so
+       // they can actually control their console).
+
+       pty, slavePath, err := console.NewPty()
+       if err != nil {
+               return err
+       }
+
+       if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
+               err = pty.Resize(console.WinSize{
+                       Height: config.ConsoleHeight,
+                       Width:  config.ConsoleWidth,
+               })
+
+               if err != nil {
+                       return err
+               }
+       }
+
+       // After we return from here, we don't need the console anymore.
+       defer pty.Close()
+
+       // Mount the console inside our rootfs.
+       if mount {
+               if err := mountConsole(slavePath); err != nil {
+                       return err
+               }
+       }
+       // While we can access console.master, using the API is a good idea.
+       if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
+               return err
+       }
+       // Now, dup over all the things.
+       return dupStdio(slavePath)
+}
+
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
	// Tell parent.
	if err := writeSync(pipe, procReady); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	return readSync(pipe, procRun)
}

// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
	// Tell parent.
	if err := writeSync(pipe, procHooks); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	return readSync(pipe, procResume)
}
+
// setupUser changes the groups, gid, and uid for the user inside the container.
// The ordering is deliberate: stdio ownership is fixed and supplementary
// groups are set while we still have privilege, gid is dropped before uid.
func setupUser(config *initConfig) error {
	// Set up defaults.
	defaultExecUser := user.ExecUser{
		Uid:  0,
		Gid:  0,
		Home: "/",
	}

	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return err
	}

	groupPath, err := user.GetGroupPath()
	if err != nil {
		return err
	}

	// Resolve config.User against the container's passwd/group files,
	// falling back to root with HOME=/ when unset.
	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
	if err != nil {
		return err
	}

	var addGroups []int
	if len(config.AdditionalGroups) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
		if err != nil {
			return err
		}
	}

	// Rather than just erroring out later in setuid(2) and setgid(2), check
	// that the user is mapped here.
	if _, err := config.Config.HostUID(execUser.Uid); err != nil {
		return fmt.Errorf("cannot set uid to unmapped user in user namespace")
	}
	if _, err := config.Config.HostGID(execUser.Gid); err != nil {
		return fmt.Errorf("cannot set gid to unmapped user in user namespace")
	}

	if config.RootlessEUID {
		// We cannot set any additional groups in a rootless container and thus
		// we bail if the user asked us to do so. TODO: We currently can't do
		// this check earlier, but if libcontainer.Process.User was typesafe
		// this might work.
		if len(addGroups) > 0 {
			return fmt.Errorf("cannot set any additional groups in a rootless container")
		}
	}

	// Before we change to the container's user make sure that the processes
	// STDIO is correctly owned by the user that we are switching to.
	if err := fixStdioPermissions(config, execUser); err != nil {
		return err
	}

	// /proc/self/setgroups may not exist on older kernels or outside a user
	// namespace; its absence is not an error.
	setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
	if err != nil && !os.IsNotExist(err) {
		return err
	}

	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
	// There's nothing we can do about /etc/group entries, so we silently
	// ignore setting groups here (since the user didn't explicitly ask us to
	// set the group).
	allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"

	if allowSupGroups {
		suppGroups := append(execUser.Sgids, addGroups...)
		if err := unix.Setgroups(suppGroups); err != nil {
			return err
		}
	}

	// Drop gid before uid: once uid is dropped we may no longer be
	// privileged to change gids.
	if err := system.Setgid(execUser.Gid); err != nil {
		return err
	}
	if err := system.Setuid(execUser.Uid); err != nil {
		return err
	}

	// if we didn't get HOME already, set it based on the user's HOME
	if envHome := os.Getenv("HOME"); envHome == "" {
		if err := os.Setenv("HOME", execUser.Home); err != nil {
			return err
		}
	}
	return nil
}
+
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
	// Stat /dev/null once so fds pointing at it can be recognized below.
	var null unix.Stat_t
	if err := unix.Stat("/dev/null", &null); err != nil {
		return err
	}
	for _, fd := range []uintptr{
		os.Stdin.Fd(),
		os.Stderr.Fd(),
		os.Stdout.Fd(),
	} {
		var s unix.Stat_t
		if err := unix.Fstat(int(fd), &s); err != nil {
			return err
		}

		// Skip chown of /dev/null if it was used as one of the STDIO fds.
		if s.Rdev == null.Rdev {
			continue
		}

		// We only change the uid owner (as it is possible for the mount to
		// prefer a different gid, and there's no reason for us to change it).
		// The reason why we don't just leave the default uid=X mount setup is
		// that users expect to be able to actually use their console. Without
		// this code, you couldn't effectively run as a non-root user inside a
		// container and also have a console set up.
		if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
			// If we've hit an EINVAL then s.Gid isn't mapped in the user
			// namespace. If we've hit an EPERM then the inode's current owner
			// is not mapped in our user namespace (in particular,
			// privileged_wrt_inode_uidgid() has failed). In either case, we
			// are in a configuration where it's better for us to just not
			// touch the stdio rather than bail at this point.
			if err == unix.EINVAL || err == unix.EPERM {
				continue
			}
			return err
		}
	}
	return nil
}
+
+// setupNetwork sets up and initializes any network interface inside the container.
+func setupNetwork(config *initConfig) error {
+       for _, config := range config.Networks {
+               strategy, err := getStrategy(config.Type)
+               if err != nil {
+                       return err
+               }
+               if err := strategy.initialize(config); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func setupRoute(config *configs.Config) error {
+       for _, config := range config.Routes {
+               _, dst, err := net.ParseCIDR(config.Destination)
+               if err != nil {
+                       return err
+               }
+               src := net.ParseIP(config.Source)
+               if src == nil {
+                       return fmt.Errorf("Invalid source for route: %s", config.Source)
+               }
+               gw := net.ParseIP(config.Gateway)
+               if gw == nil {
+                       return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
+               }
+               l, err := netlink.LinkByName(config.InterfaceName)
+               if err != nil {
+                       return err
+               }
+               route := &netlink.Route{
+                       Scope:     netlink.SCOPE_UNIVERSE,
+                       Dst:       dst,
+                       Src:       src,
+                       Gw:        gw,
+                       LinkIndex: l.Attrs().Index,
+               }
+               if err := netlink.RouteAdd(route); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func setupRlimits(limits []configs.Rlimit, pid int) error {
+       for _, rlimit := range limits {
+               if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
+                       return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
+               }
+       }
+       return nil
+}
+
// _P_PID is the waitid(2) idtype value meaning "the id argument is a PID".
const _P_PID = 1

// siginfo mirrors the start of the kernel's siginfo_t as filled in by
// waitid(2); only si_pid is consumed by isWaitable.
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad to 128 bytes as detailed in blockUntilWaitable
	pad [96]byte
}
+
// isWaitable returns true if the process has exited, false otherwise.
// It is based on blockUntilWaitable in src/os/wait_waitid.go: WNOHANG makes
// the call non-blocking and WNOWAIT peeks without reaping, so si_pid is
// non-zero exactly when an exit status is ready to be collected.
func isWaitable(pid int) (bool, error) {
	si := &siginfo{}
	_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
	if e != 0 {
		return false, os.NewSyscallError("waitid", e)
	}

	return si.si_pid != 0, nil
}
+
+// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
+func isNoChildren(err error) bool {
+       switch err := err.(type) {
+       case syscall.Errno:
+               if err == unix.ECHILD {
+                       return true
+               }
+       case *os.SyscallError:
+               if err.Err == unix.ECHILD {
+                       return true
+               }
+       }
+       return false
+}
+
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
	var procs []*os.Process
	// Freeze so no process can fork between listing PIDs and signalling
	// them; a freeze failure is logged but treated as best-effort.
	if err := m.Freeze(configs.Frozen); err != nil {
		logrus.Warn(err)
	}
	pids, err := m.GetAllPids()
	if err != nil {
		m.Freeze(configs.Thawed)
		return err
	}
	for _, pid := range pids {
		p, err := os.FindProcess(pid)
		if err != nil {
			logrus.Warn(err)
			continue
		}
		procs = append(procs, p)
		if err := p.Signal(s); err != nil {
			logrus.Warn(err)
		}
	}
	// Thaw so the signalled processes can actually run (and exit).
	if err := m.Freeze(configs.Thawed); err != nil {
		logrus.Warn(err)
	}

	subreaper, err := system.GetSubreaper()
	if err != nil {
		// The error here means that PR_GET_CHILD_SUBREAPER is not
		// supported because this code might run on a kernel older
		// than 3.4. We don't want to throw an error in that case,
		// and we simplify things, considering there is no subreaper
		// set.
		subreaper = 0
	}

	for _, p := range procs {
		if s != unix.SIGKILL {
			if ok, err := isWaitable(p.Pid); err != nil {
				if !isNoChildren(err) {
					logrus.Warn("signalAllProcesses: ", p.Pid, err)
				}
				continue
			} else if !ok {
				// Not ready to report so don't wait
				continue
			}
		}

		// In case a subreaper has been setup, this code must not
		// wait for the process. Otherwise, we cannot be sure the
		// current process will be reaped by the subreaper, while
		// the subreaper might be waiting for this process in order
		// to retrieve its exit code.
		if subreaper == 0 {
			if _, err := p.Wait(); err != nil {
				if !isNoChildren(err) {
					logrus.Warn("wait: ", err)
				}
			}
		}
	}
	return nil
}
diff --git a/libcontainer/integration/checkpoint_test.go b/libcontainer/integration/checkpoint_test.go
new file mode 100644 (file)
index 0000000..cdb6810
--- /dev/null
@@ -0,0 +1,264 @@
+package integration
+
+import (
+       "bufio"
+       "bytes"
+       "io/ioutil"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+
+       "golang.org/x/sys/unix"
+)
+
+func showFile(t *testing.T, fname string) error {
+       t.Logf("=== %s ===\n", fname)
+
+       f, err := os.Open(fname)
+       if err != nil {
+               t.Log(err)
+               return err
+       }
+       defer f.Close()
+
+       scanner := bufio.NewScanner(f)
+       for scanner.Scan() {
+               t.Log(scanner.Text())
+       }
+
+       if err := scanner.Err(); err != nil {
+               return err
+       }
+
+       t.Logf("=== END ===\n")
+
+       return nil
+}
+
// TestUsernsCheckpoint would exercise checkpoint/restore inside a user
// namespace, but is currently disabled by the unconditional Skip below —
// the userns and criu feature probes that follow it are unreachable until
// that Skip is removed.
func TestUsernsCheckpoint(t *testing.T) {
	t.Skip("Ubuntu kernel is broken to run criu (#2196, #2198)")
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("userns is unsupported")
	}
	// Probe criu for userns support before attempting a real dump.
	cmd := exec.Command("criu", "check", "--feature", "userns")
	if err := cmd.Run(); err != nil {
		t.Skip("Unable to c/r a container with userns")
	}
	testCheckpoint(t, true)
}
+
// TestCheckpoint runs the checkpoint/restore cycle without a user namespace.
// It is currently disabled by the unconditional Skip below.
func TestCheckpoint(t *testing.T) {
	t.Skip("Ubuntu kernel is broken to run criu (#2196, #2198)")
	testCheckpoint(t, false)
}
+
// testCheckpoint runs a full criu pre-dump + checkpoint + restore cycle
// against a simple "cat" container, then verifies that a write to the
// restored process's stdin still reaches its stdout.
func testCheckpoint(t *testing.T, userns bool) {
	if testing.Short() {
		return
	}
	if cgroups.IsCgroup2UnifiedMode() {
		t.Skip("cgroup v1 is not supported")
	}

	root, err := newTestRoot()
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(root)

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)

	config.Mounts = append(config.Mounts, &configs.Mount{
		Destination: "/sys/fs/cgroup",
		Device:      "cgroup",
		Flags:       defaultMountFlags | unix.MS_RDONLY,
	})

	if userns {
		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	}

	factory, err := libcontainer.New(root, libcontainer.Cgroupfs)

	if err != nil {
		t.Fatal(err)
	}

	container, err := factory.Create("test", config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	var stdout bytes.Buffer

	// "cat" blocks on stdin, keeping the container alive until the pipe's
	// write end is closed.
	pconfig := libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"cat"},
		Env:    standardEnvironment,
		Stdin:  stdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Run(&pconfig)
	stdinR.Close()
	defer stdinW.Close()
	if err != nil {
		t.Fatal(err)
	}

	pid, err := pconfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err := os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	parentDir, err := ioutil.TempDir("", "criu-parent")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(parentDir)

	// Phase 1: pre-dump. The container must still be Running afterwards.
	preDumpOpts := &libcontainer.CriuOpts{
		ImagesDirectory: parentDir,
		WorkDirectory:   parentDir,
		PreDump:         true,
	}
	preDumpLog := filepath.Join(preDumpOpts.WorkDirectory, "dump.log")

	if err := container.Checkpoint(preDumpOpts); err != nil {
		showFile(t, preDumpLog)
		t.Fatal(err)
	}

	state, err := container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Running {
		t.Fatal("Unexpected preDump state: ", state)
	}

	imagesDir, err := ioutil.TempDir("", "criu")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(imagesDir)

	// Phase 2: final dump relative to the pre-dump images.
	// NOTE(review): "../criu-parent" assumes a sibling directory literally
	// named "criu-parent", but parentDir above carries a random TempDir
	// suffix — confirm this relative path when re-enabling the test.
	checkpointOpts := &libcontainer.CriuOpts{
		ImagesDirectory: imagesDir,
		WorkDirectory:   imagesDir,
		ParentImage:     "../criu-parent",
	}
	dumpLog := filepath.Join(checkpointOpts.WorkDirectory, "dump.log")
	restoreLog := filepath.Join(checkpointOpts.WorkDirectory, "restore.log")

	if err := container.Checkpoint(checkpointOpts); err != nil {
		showFile(t, dumpLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Stopped {
		t.Fatal("Unexpected state checkpoint: ", state)
	}

	stdinW.Close()
	_, err = process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	// reload the container
	container, err = factory.Load("test")
	if err != nil {
		t.Fatal(err)
	}

	// Phase 3: restore from the dump and prove the restored process still
	// forwards stdin to stdout.
	restoreStdinR, restoreStdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	restoreProcessConfig := &libcontainer.Process{
		Cwd:    "/",
		Stdin:  restoreStdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Restore(restoreProcessConfig, checkpointOpts)
	restoreStdinR.Close()
	defer restoreStdinW.Close()
	if err != nil {
		showFile(t, restoreLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}
	if state != libcontainer.Running {
		t.Fatal("Unexpected restore state: ", state)
	}

	pid, err = restoreProcessConfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err = os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	_, err = restoreStdinW.WriteString("Hello!")
	if err != nil {
		t.Fatal(err)
	}

	restoreStdinW.Close()
	s, err := process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	if !s.Success() {
		t.Fatal(s.String(), pid)
	}

	output := string(stdout.Bytes())
	if !strings.Contains(output, "Hello!") {
		t.Fatal("Did not restore the pipe correctly:", output)
	}
}
diff --git a/libcontainer/integration/doc.go b/libcontainer/integration/doc.go
new file mode 100644 (file)
index 0000000..87545bc
--- /dev/null
@@ -0,0 +1,2 @@
+// integration is used for integration testing of libcontainer
+package integration
diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
new file mode 100644 (file)
index 0000000..7822fa8
--- /dev/null
@@ -0,0 +1,1793 @@
+package integration
+
+import (
+       "bytes"
+       "encoding/json"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "reflect"
+       "strconv"
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "golang.org/x/sys/unix"
+)
+
+func TestExecPS(t *testing.T) { // run `ps` in a container without a user namespace
+       testExecPS(t, false)
+}
+
+func TestUsernsExecPS(t *testing.T) { // same as TestExecPS but inside a user namespace
+       if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { // kernel built without userns support
+               t.Skip("userns is unsupported")
+       }
+       testExecPS(t, true)
+}
+
+func testExecPS(t *testing.T, userns bool) { // runs `ps` in a fresh container and checks the ps process itself is PID 1
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+       if userns { // identity-map a 1000-id range and add a user namespace
+               config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+               config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+               config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+       }
+
+       buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm")
+       if err != nil {
+               t.Fatalf("%s: %s", buffers, err)
+       }
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+       lines := strings.Split(buffers.Stdout.String(), "\n")
+       if len(lines) < 2 { // expect at least the header line plus one process line
+               t.Fatalf("more than one process running for output %q", buffers.Stdout.String())
+       }
+       expected := `1 root     ps` // in a private pid namespace ps must see itself as PID 1
+       actual := strings.Trim(lines[1], "\n ")
+       if actual != expected {
+               t.Fatalf("expected output %q but received %q", expected, actual)
+       }
+}
+
+func TestIPCPrivate(t *testing.T) { // the container's IPC namespace link must differ from the host's
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/ipc") // host (init) IPC namespace identity
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { // equal links mean the namespace was shared
+               t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l)
+       }
+}
+
+func TestIPCHost(t *testing.T) { // removing NEWIPC from the config must leave the container in the host IPC namespace
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/ipc") // host (init) IPC namespace identity
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces.Remove(configs.NEWIPC) // no private IPC namespace for this container
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
+               t.Fatalf("ipc link not equal to host link %q %q", actual, l)
+       }
+}
+
+func TestIPCJoinPath(t *testing.T) { // joining an IPC namespace by path (/proc/1/ns/ipc) must yield that namespace
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/ipc") // host (init) IPC namespace identity
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc") // join the host's IPC ns via its path
+
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
+               t.Fatalf("ipc link not equal to host link %q %q", actual, l)
+       }
+}
+
+func TestIPCBadPath(t *testing.T) { // a nonexistent IPC namespace path must make container start fail
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipcc") // deliberate typo: path does not exist
+
+       _, _, err = runContainer(config, "", "true")
+       if err == nil {
+               t.Fatal("container succeeded with bad ipc path")
+       }
+}
+
+func TestRlimit(t *testing.T) { // rlimit handling without a user namespace
+       testRlimit(t, false)
+}
+
+func TestUsernsRlimit(t *testing.T) { // rlimit handling inside a user namespace
+       if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { // kernel built without userns support
+               t.Skip("userns is unsupported")
+       }
+
+       testRlimit(t, true)
+}
+
+func testRlimit(t *testing.T, userns bool) { // verifies the container's RLIMIT_NOFILE is raised above the host's lowered limit
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       if userns { // identity-map a 1000-id range and add a user namespace
+               config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+               config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+               config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+       }
+
+       // ensure limit is lower than what the config requests to test that in a user namespace
+       // the Setrlimit call happens early enough that we still have permissions to raise the limit.
+       ok(t, unix.Setrlimit(unix.RLIMIT_NOFILE, &unix.Rlimit{
+               Max: 1024,
+               Cur: 1024,
+       }))
+
+       out, _, err := runContainer(config, "", "/bin/sh", "-c", "ulimit -n")
+       ok(t, err)
+       if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" { // 1025 presumably comes from the template config's rlimits — confirm in newTemplateConfig
+               t.Fatalf("expected rlimit to be 1025, got %s", limit)
+       }
+}
+
+func TestEnter(t *testing.T) { // runs a second process in a live container and checks both share the container's pid ns
+       if testing.Short() {
+               return
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       // Execute a first process in the container
+       stdinR, stdinW, err := os.Pipe() // stdin pipe keeps `cat` blocked until we close the write end
+       ok(t, err)
+
+       var stdout, stdout2 bytes.Buffer
+
+       pconfig := libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"}, // prints its pid-ns link once stdin closes
+               Env:    standardEnvironment,
+               Stdin:  stdinR,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       stdinR.Close() // parent no longer needs the read end once the child holds it
+       defer stdinW.Close()
+       ok(t, err)
+       pid, err := pconfig.Pid()
+       ok(t, err)
+
+       // Execute another process in the container
+       stdinR2, stdinW2, err := os.Pipe()
+       ok(t, err)
+       pconfig2 := libcontainer.Process{
+               Cwd: "/",
+               Env: standardEnvironment,
+       }
+       pconfig2.Args = []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"}
+       pconfig2.Stdin = stdinR2
+       pconfig2.Stdout = &stdout2
+
+       err = container.Run(&pconfig2) // non-Init: joins the existing container
+       stdinR2.Close()
+       defer stdinW2.Close()
+       ok(t, err)
+
+       pid2, err := pconfig2.Pid()
+       ok(t, err)
+
+       processes, err := container.Processes()
+       ok(t, err)
+
+       n := 0
+       for i := range processes { // both pids must be reported as members of the container
+               if processes[i] == pid || processes[i] == pid2 {
+                       n++
+               }
+       }
+       if n != 2 {
+               t.Fatal("unexpected number of processes", processes, pid, pid2)
+       }
+
+       // Wait processes
+       stdinW2.Close() // unblocks the second `cat`
+       waitProcess(&pconfig2, t)
+
+       stdinW.Close() // unblocks the first `cat`
+       waitProcess(&pconfig, t)
+
+       // Check that both processes live in the same pidns
+       pidns := string(stdout.Bytes())
+       ok(t, err) // re-checks the last err; no new error can occur here
+
+       pidns2 := string(stdout2.Bytes())
+       ok(t, err) // re-checks the last err; no new error can occur here
+
+       if pidns != pidns2 {
+               t.Fatal("The second process isn't in the required pid namespace", pidns, pidns2)
+       }
+}
+
+func TestProcessEnv(t *testing.T) { // custom env vars must be visible in the container and HOME must be set
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:  "/",
+               Args: []string{"sh", "-c", "env"},
+               Env: []string{ // explicit env, including the FOO=BAR marker checked below
+                       "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                       "HOSTNAME=integration",
+                       "TERM=xterm",
+                       "FOO=BAR",
+               },
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       outputEnv := string(stdout.Bytes())
+
+       // Check that the environment has the key/value pair we added
+       if !strings.Contains(outputEnv, "FOO=BAR") {
+               t.Fatal("Environment doesn't have the expected FOO=BAR key/value pair: ", outputEnv)
+       }
+
+       // Make sure that HOME is set
+       if !strings.Contains(outputEnv, "HOME=/root") { // HOME is not in Env above, so it must be injected by libcontainer
+               t.Fatal("Environment doesn't have HOME set: ", outputEnv)
+       }
+}
+
+func TestProcessEmptyCaps(t *testing.T) { // a container with no configured capabilities must still start and expose CapEff
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Capabilities = nil // drop the template's capability configuration entirely
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"sh", "-c", "cat /proc/self/status"},
+               Env:    standardEnvironment,
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       outputStatus := string(stdout.Bytes())
+
+       lines := strings.Split(outputStatus, "\n")
+
+       effectiveCapsLine := ""
+       for _, l := range lines { // locate the CapEff line in /proc/self/status
+               line := strings.TrimSpace(l)
+               if strings.Contains(line, "CapEff:") {
+                       effectiveCapsLine = line
+                       break
+               }
+       }
+
+       if effectiveCapsLine == "" { // only presence is asserted, not the mask's value
+               t.Fatal("Couldn't find effective caps: ", outputStatus)
+       }
+}
+
+func TestProcessCaps(t *testing.T) { // CAP_NET_ADMIN granted on the Process must show up in the CapEff bitmask
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:          "/",
+               Args:         []string{"sh", "-c", "cat /proc/self/status"},
+               Env:          standardEnvironment,
+               Stdin:        nil,
+               Stdout:       &stdout,
+               Capabilities: &configs.Capabilities{},
+               Init:         true,
+       }
+       pconfig.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_NET_ADMIN") // NOTE(review): appends read config's slices, presumably "template caps + CAP_NET_ADMIN" — confirm no aliasing of config.Capabilities
+       pconfig.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_NET_ADMIN")
+       pconfig.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_NET_ADMIN")
+       pconfig.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_NET_ADMIN")
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       outputStatus := string(stdout.Bytes())
+
+       lines := strings.Split(outputStatus, "\n")
+
+       effectiveCapsLine := ""
+       for _, l := range lines { // locate the CapEff line in /proc/self/status
+               line := strings.TrimSpace(l)
+               if strings.Contains(line, "CapEff:") {
+                       effectiveCapsLine = line
+                       break
+               }
+       }
+
+       if effectiveCapsLine == "" {
+               t.Fatal("Couldn't find effective caps: ", outputStatus)
+       }
+
+       parts := strings.Split(effectiveCapsLine, ":")
+       effectiveCapsStr := strings.TrimSpace(parts[1])
+
+       effectiveCaps, err := strconv.ParseUint(effectiveCapsStr, 16, 64) // the mask is printed as hex
+       if err != nil {
+               t.Fatal("Could not parse effective caps", err)
+       }
+
+       var netAdminMask uint64
+       var netAdminBit uint
+       netAdminBit = 12 // from capability.h
+       netAdminMask = 1 << netAdminBit
+       if effectiveCaps&netAdminMask != netAdminMask {
+               t.Fatal("CAP_NET_ADMIN is not set as expected")
+       }
+}
+
+func TestAdditionalGroups(t *testing.T) { // AdditionalGroups must appear in the process's group list
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:              "/",
+               Args:             []string{"sh", "-c", "id -Gn"}, // fixed: a separate "-Gn" element was bound to $0 by `sh -c`, so only bare `id` ran
+               Env:              standardEnvironment,
+               Stdin:            nil,
+               Stdout:           &stdout,
+               AdditionalGroups: []string{"plugdev", "audio"},
+               Init:             true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       outputGroups := string(stdout.Bytes())
+
+       // Check that the groups output has the groups that we specified
+       if !strings.Contains(outputGroups, "audio") {
+               t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
+       }
+
+       if !strings.Contains(outputGroups, "plugdev") {
+               t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
+       }
+}
+
+func TestFreeze(t *testing.T) { // pause/resume with the default cgroup manager
+       testFreeze(t, false)
+}
+
+func TestSystemdFreeze(t *testing.T) { // pause/resume with the systemd cgroup manager
+       if !systemd.UseSystemd() {
+               t.Skip("Systemd is unsupported")
+       }
+       testFreeze(t, true)
+}
+
+func testFreeze(t *testing.T, systemd bool) { // Pause must report Paused; Resume must let the process finish
+       if testing.Short() {
+               return
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       stdinR, stdinW, err := os.Pipe() // keeps `cat` alive until the write end is closed
+       ok(t, err)
+
+       pconfig := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR,
+               Init:  true,
+       }
+       err = container.Run(pconfig)
+       stdinR.Close()
+       defer stdinW.Close()
+       ok(t, err)
+
+       err = container.Pause()
+       ok(t, err)
+       state, err := container.Status() // snapshot the state while frozen
+       ok(t, err)
+       err = container.Resume() // resume before asserting, so a failure doesn't leave the container frozen
+       ok(t, err)
+       if state != libcontainer.Paused {
+               t.Fatal("Unexpected state: ", state)
+       }
+
+       stdinW.Close() // unblocks `cat`
+       waitProcess(pconfig, t)
+}
+
+func TestCpuShares(t *testing.T) { // invalid cpu shares with the default cgroup manager
+       testCpuShares(t, false)
+}
+
+func TestCpuSharesSystemd(t *testing.T) { // invalid cpu shares with the systemd cgroup manager
+       if !systemd.UseSystemd() {
+               t.Skip("Systemd is unsupported")
+       }
+       testCpuShares(t, true)
+}
+
+func testCpuShares(t *testing.T, systemd bool) {
+       if testing.Short() {
+               return
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       if systemd {
+               config.Cgroups.Parent = "system.slice"
+       }
+       config.Cgroups.Resources.CpuShares = 1
+
+       _, _, err = runContainer(config, "", "ps")
+       if err == nil {
+               t.Fatalf("runContainer should failed with invalid CpuShares")
+       }
+}
+
+func TestPids(t *testing.T) { // pids cgroup limits with the default cgroup manager
+       testPids(t, false)
+}
+
+func TestPidsSystemd(t *testing.T) { // pids cgroup limits with the systemd cgroup manager
+       if !systemd.UseSystemd() {
+               t.Skip("Systemd is unsupported")
+       }
+       testPids(t, true)
+}
+
+func testPids(t *testing.T, systemd bool) {
+       if testing.Short() {
+               return
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       if systemd {
+               config.Cgroups.Parent = "system.slice"
+       }
+       config.Cgroups.Resources.PidsLimit = -1
+
+       // Running multiple processes.
+       _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
+       if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+               t.Skip("PIDs cgroup is unsupported")
+       }
+       ok(t, err)
+
+       if ret != 0 {
+               t.Fatalf("expected fork() to succeed with no pids limit")
+       }
+
+       // Enforce a permissive limit. This needs to be fairly hand-wavey due to the
+       // issues with running Go binaries with pids restrictions (see below).
+       config.Cgroups.Resources.PidsLimit = 64
+       _, ret, err = runContainer(config, "", "/bin/sh", "-c", `
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`)
+       if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+               t.Skip("PIDs cgroup is unsupported")
+       }
+       ok(t, err)
+
+       if ret != 0 {
+               t.Fatalf("expected fork() to succeed with permissive pids limit")
+       }
+
+       // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
+       // to fail reliability.
+       config.Cgroups.Resources.PidsLimit = 64
+       out, _, err := runContainer(config, "", "/bin/sh", "-c", `
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
+       /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`)
+       if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+               t.Skip("PIDs cgroup is unsupported")
+       }
+       if err != nil && !strings.Contains(out.String(), "sh: can't fork") {
+               ok(t, err)
+       }
+
+       if err == nil {
+               t.Fatalf("expected fork() to fail with restrictive pids limit")
+       }
+
+       // Minimal restrictions are not really supported, due to quirks in using Go
+       // due to the fact that it spawns random processes. While we do our best with
+       // late setting cgroup values, it's just too unreliable with very small pids.max.
+       // As such, we don't test that case. YMMV.
+}
+
+func TestRunWithKernelMemory(t *testing.T) { // kernel memory limit with the default cgroup manager
+       testRunWithKernelMemory(t, false)
+}
+
+func TestRunWithKernelMemorySystemd(t *testing.T) { // kernel memory limit with the systemd cgroup manager
+       if !systemd.UseSystemd() {
+               t.Skip("Systemd is unsupported")
+       }
+       testRunWithKernelMemory(t, true)
+}
+
+func testRunWithKernelMemory(t *testing.T, systemd bool) { // a container with a kernel memory limit must start successfully
+       if testing.Short() {
+               return
+       }
+       if cgroups.IsCgroup2UnifiedMode() {
+               t.Skip("cgroup v1 is not supported")
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       if systemd {
+               config.Cgroups.Parent = "system.slice"
+       }
+       config.Cgroups.Resources.KernelMemory = 52428800 // 50 MiB
+
+       _, _, err = runContainer(config, "", "ps")
+       if err != nil {
+               t.Fatalf("runContainer failed with kernel memory limit: %v", err)
+       }
+}
+
+func TestContainerState(t *testing.T) { // State() must record the host IPC namespace path when NEWIPC is omitted
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/ipc") // host (init) IPC namespace identity
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces = configs.Namespaces([]configs.Namespace{
+               {Type: configs.NEWNS},
+               {Type: configs.NEWUTS},
+               // host for IPC
+               //{Type: configs.NEWIPC},
+               {Type: configs.NEWPID},
+               {Type: configs.NEWNET},
+       })
+
+       container, err := newContainerWithName("test", config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       stdinR, stdinW, err := os.Pipe() // keeps `cat` alive while we inspect the state
+       if err != nil {
+               t.Fatal(err)
+       }
+       p := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR,
+               Init:  true,
+       }
+       err = container.Run(p)
+       if err != nil {
+               t.Fatal(err)
+       }
+       stdinR.Close()
+       defer stdinW.Close()
+
+       st, err := container.State()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       l1, err := os.Readlink(st.NamespacePaths[configs.NEWIPC]) // recorded IPC ns path of the container
+       if err != nil {
+               t.Fatal(err)
+       }
+       if l1 != l { // must match the host since no private IPC namespace was created
+               t.Fatal("Container using non-host ipc namespace")
+       }
+       stdinW.Close() // unblocks `cat`
+       waitProcess(p, t)
+}
+
+func TestPassExtraFiles(t *testing.T) { // ExtraFiles must be passed to init as fds 3 and 4 and be writable
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       container, err := newContainerWithName("test", config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pipeout1, pipein1, err := os.Pipe()
+       if err != nil {
+               t.Fatal(err)
+       }
+       pipeout2, pipein2, err := os.Pipe()
+       if err != nil {
+               t.Fatal(err)
+       }
+       process := libcontainer.Process{
+               Cwd:        "/",
+               Args:       []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, // lists fds, then writes a marker into each extra fd
+               Env:        []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
+               ExtraFiles: []*os.File{pipein1, pipein2}, // become fds 3 and 4 inside the container
+               Stdin:      nil,
+               Stdout:     &stdout,
+               Init:       true,
+       }
+       err = container.Run(&process)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       waitProcess(&process, t)
+
+       out := string(stdout.Bytes())
+       // fd 5 is the directory handle for /proc/$$/fd
+       if out != "0 1 2 3 4 5" {
+               t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to init, got '%s'", out)
+       }
+       var buf = []byte{0}
+       _, err = pipeout1.Read(buf) // read the single marker byte written to fd 3
+       if err != nil {
+               t.Fatal(err)
+       }
+       out1 := string(buf)
+       if out1 != "1" {
+               t.Fatalf("expected first pipe to receive '1', got '%s'", out1)
+       }
+
+       _, err = pipeout2.Read(buf) // read the single marker byte written to fd 4
+       if err != nil {
+               t.Fatal(err)
+       }
+       out2 := string(buf)
+       if out2 != "2" {
+               t.Fatalf("expected second pipe to receive '2', got '%s'", out2)
+       }
+}
+
+func TestMountCmds(t *testing.T) { // Premount/Postmount commands of a bind mount must run and leave their files behind
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       tmpDir, err := ioutil.TempDir("", "tmpdir")
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer os.RemoveAll(tmpDir)
+
+       config := newTemplateConfig(rootfs)
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Source:      tmpDir,
+               Destination: "/tmp",
+               Device:      "bind",
+               Flags:       unix.MS_BIND | unix.MS_REC,
+               PremountCmds: []configs.Command{ // run on the host before the bind mount happens
+                       {Path: "touch", Args: []string{filepath.Join(tmpDir, "hello")}},
+                       {Path: "touch", Args: []string{filepath.Join(tmpDir, "world")}},
+               },
+               PostmountCmds: []configs.Command{ // run after mounting; copies visible through the rootfs path
+                       {Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "hello"), filepath.Join(rootfs, "tmp", "hello-backup")}},
+                       {Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "world"), filepath.Join(rootfs, "tmp", "world-backup")}},
+               },
+       })
+
+       container, err := newContainerWithName("test", config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       pconfig := libcontainer.Process{
+               Cwd:  "/",
+               Args: []string{"sh", "-c", "env"},
+               Env:  standardEnvironment,
+               Init: true,
+       }
+       err = container.Run(&pconfig)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       entries, err := ioutil.ReadDir(tmpDir)
+       if err != nil {
+               t.Fatal(err)
+       }
+       expected := []string{"hello", "hello-backup", "world", "world-backup"} // relies on ReadDir returning entries sorted by name
+       for i, e := range entries {
+               if e.Name() != expected[i] {
+                       t.Errorf("Got(%s), expect %s", e.Name(), expected[i])
+               }
+       }
+}
+
+func TestSysctl(t *testing.T) { // a configured sysctl must be visible from inside the container
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Sysctl = map[string]string{ // kernel.shmmni is IPC-namespaced, so this is safe to set
+               "kernel.shmmni": "8192",
+       }
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"sh", "-c", "cat /proc/sys/kernel/shmmni"},
+               Env:    standardEnvironment,
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       shmmniOutput := strings.TrimSpace(string(stdout.Bytes()))
+       if shmmniOutput != "8192" {
+               t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", shmmniOutput)
+       }
+}
+
+func TestMountCgroupRO(t *testing.T) { // a read-only cgroup mount must show ro,nosuid,nodev,noexec options
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Destination: "/sys/fs/cgroup",
+               Device:      "cgroup",
+               Flags:       defaultMountFlags | unix.MS_RDONLY,
+       })
+
+       buffers, exitCode, err := runContainer(config, "", "mount")
+       if err != nil {
+               t.Fatalf("%s: %s", buffers, err)
+       }
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+       mountInfo := buffers.Stdout.String()
+       lines := strings.Split(mountInfo, "\n")
+       for _, l := range lines {
+               if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { // the tmpfs skeleton that hosts the per-subsystem mounts
+                       if !strings.Contains(l, "ro") ||
+                               !strings.Contains(l, "nosuid") ||
+                               !strings.Contains(l, "nodev") ||
+                               !strings.Contains(l, "noexec") {
+                               t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l)
+                       }
+                       if !strings.Contains(l, "mode=755") {
+                               t.Fatalf("Mode expected to contain 'mode=755': %s", l)
+                       }
+                       continue
+               }
+               if !strings.HasPrefix(l, "cgroup") { // only inspect per-subsystem cgroup mount lines
+                       continue
+               }
+               if !strings.Contains(l, "ro") || // NOTE(review): substring check is weak — "cgroup" itself contains "ro", so this can't fail on these lines; confirm intent
+                       !strings.Contains(l, "nosuid") ||
+                       !strings.Contains(l, "nodev") ||
+                       !strings.Contains(l, "noexec") {
+                       t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l)
+               }
+       }
+}
+
+func TestMountCgroupRW(t *testing.T) { // a writable cgroup mount must show rw,nosuid,nodev,noexec options
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Destination: "/sys/fs/cgroup",
+               Device:      "cgroup",
+               Flags:       defaultMountFlags,
+       })
+
+       buffers, exitCode, err := runContainer(config, "", "mount")
+       if err != nil {
+               t.Fatalf("%s: %s", buffers, err)
+       }
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+       mountInfo := buffers.Stdout.String()
+       lines := strings.Split(mountInfo, "\n")
+       for _, l := range lines {
+               if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { // the tmpfs skeleton that hosts the per-subsystem mounts
+                       if !strings.Contains(l, "rw") ||
+                               !strings.Contains(l, "nosuid") ||
+                               !strings.Contains(l, "nodev") ||
+                               !strings.Contains(l, "noexec") {
+                               t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l)
+                       }
+                       if !strings.Contains(l, "mode=755") {
+                               t.Fatalf("Mode expected to contain 'mode=755': %s", l)
+                       }
+                       continue
+               }
+               if !strings.HasPrefix(l, "cgroup") { // only inspect per-subsystem cgroup mount lines
+                       continue
+               }
+               if !strings.Contains(l, "rw") || // NOTE(review): substring check — "rw" may also appear inside option lists of either mode; confirm intent
+                       !strings.Contains(l, "nosuid") ||
+                       !strings.Contains(l, "nodev") ||
+                       !strings.Contains(l, "noexec") {
+                       t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l)
+               }
+       }
+}
+
+// TestOomScoreAdj sets OomScoreAdj in the container config and verifies the
+// init process observes that value in its own /proc/self/oom_score_adj.
+func TestOomScoreAdj(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.OomScoreAdj = ptrInt(200)
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"sh", "-c", "cat /proc/self/oom_score_adj"},
+               Env:    standardEnvironment,
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+       // NOTE(review): string(stdout.Bytes()) is equivalent to the idiomatic
+       // stdout.String() (staticcheck S1030).
+       outputOomScoreAdj := strings.TrimSpace(string(stdout.Bytes()))
+
+       // Check that the oom_score_adj matches the value that was set as part of config.
+       if outputOomScoreAdj != strconv.Itoa(*config.OomScoreAdj) {
+               t.Fatalf("Expected oom_score_adj %d; got %q", *config.OomScoreAdj, outputOomScoreAdj)
+       }
+}
+
+// TestHook wires prestart/poststart/poststop function hooks into a container
+// and verifies each fires with the expected bundle path and performs its
+// side effect on the rootfs: prestart creates /test, poststart writes
+// "hello world" into it, and poststop removes it after Destroy.
+func TestHook(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       bundle, err := newTestBundle()
+       ok(t, err)
+       defer remove(bundle)
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       expectedBundle := bundle
+       config.Labels = append(config.Labels, fmt.Sprintf("bundle=%s", expectedBundle))
+
+       // getRootfsFromBundle reads <bundle>/config.json and returns its Rootfs
+       // field; the hooks use it to resolve the rootfs from the hook state.
+       getRootfsFromBundle := func(bundle string) (string, error) {
+               // NOTE(review): f is never closed — file-handle leak on every
+               // hook invocation.
+               f, err := os.Open(filepath.Join(bundle, "config.json"))
+               if err != nil {
+                       return "", err
+               }
+
+               var config configs.Config
+               if err = json.NewDecoder(f).Decode(&config); err != nil {
+                       return "", err
+               }
+               return config.Rootfs, nil
+       }
+
+       config.Hooks = &configs.Hooks{
+               Prestart: []configs.Hook{
+                       configs.NewFunctionHook(func(s *specs.State) error {
+                               if s.Bundle != expectedBundle {
+                                       t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+                               }
+
+                               // Create an empty /test in the rootfs; the container's
+                               // `ls /test` below proves the hook ran before start.
+                               root, err := getRootfsFromBundle(s.Bundle)
+                               if err != nil {
+                                       return err
+                               }
+                               f, err := os.Create(filepath.Join(root, "test"))
+                               if err != nil {
+                                       return err
+                               }
+                               return f.Close()
+                       }),
+               },
+               Poststart: []configs.Hook{
+                       configs.NewFunctionHook(func(s *specs.State) error {
+                               if s.Bundle != expectedBundle {
+                                       t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+                               }
+
+                               root, err := getRootfsFromBundle(s.Bundle)
+                               if err != nil {
+                                       return err
+                               }
+                               return ioutil.WriteFile(filepath.Join(root, "test"), []byte("hello world"), 0755)
+                       }),
+               },
+               Poststop: []configs.Hook{
+                       configs.NewFunctionHook(func(s *specs.State) error {
+                               if s.Bundle != expectedBundle {
+                                       t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+                               }
+
+                               root, err := getRootfsFromBundle(s.Bundle)
+                               if err != nil {
+                                       return err
+                               }
+                               return os.RemoveAll(filepath.Join(root, "test"))
+                       }),
+               },
+       }
+
+       // write config of json format into config.json under bundle
+       // NOTE(review): f is never closed after encoding — file-handle leak,
+       // and on some platforms the write may not be flushed before reads.
+       f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644)
+       ok(t, err)
+       ok(t, json.NewEncoder(f).Encode(config))
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"sh", "-c", "ls /test"},
+               Env:    standardEnvironment,
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       outputLs := string(stdout.Bytes())
+
+       // Check that the ls output has the expected file touched by the prestart hook
+       if !strings.Contains(outputLs, "/test") {
+               container.Destroy()
+               t.Fatalf("ls output doesn't have the expected file: %s", outputLs)
+       }
+
+       // Check that the file is written by the poststart hook
+       testFilePath := filepath.Join(rootfs, "test")
+       contents, err := ioutil.ReadFile(testFilePath)
+       if err != nil {
+               t.Fatalf("cannot read file '%s': %s", testFilePath, err)
+       }
+       if string(contents) != "hello world" {
+               t.Fatalf("Expected test file to contain 'hello world'; got '%s'", string(contents))
+       }
+
+       // Destroy triggers the poststop hook, which must remove the file.
+       if err := container.Destroy(); err != nil {
+               t.Fatalf("container destroy %s", err)
+       }
+       fi, err := os.Stat(filepath.Join(rootfs, "test"))
+       // NOTE(review): if Stat fails with an error other than NotExist, fi is
+       // nil and fi.Name() in the Fatalf below would panic.
+       if err == nil || !os.IsNotExist(err) {
+               t.Fatalf("expected file to not exist, got %s", fi.Name())
+       }
+}
+
+// TestSTDIOPermissions verifies a container process can write to /dev/stderr
+// and that the output arrives on the caller's stderr buffer intact.
+func TestSTDIOPermissions(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+       buffers, exitCode, err := runContainer(config, "", "sh", "-c", "echo hi > /dev/stderr")
+       ok(t, err)
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       // NOTE(review): failure message reads "should equal be equal" —
+       // garbled wording (message text left unchanged here).
+       if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" {
+               t.Fatalf("stderr should equal be equal %q %q", actual, "hi")
+       }
+}
+
+// unmountOp lazily detaches the mount at path (MNT_DETACH), so the unmount
+// succeeds even if the mount point is still busy. Callers in this file
+// typically invoke it via defer and ignore the returned error.
+func unmountOp(path string) error {
+       return unix.Unmount(path, unix.MNT_DETACH)
+}
+
+// Launch container with rootfsPropagation in rslave mode. Also
+// bind mount a volume /mnt1host at /mnt1cont at the time of launch. Now do
+// another mount on host (/mnt1host/mnt2host) and this new mount should
+// propagate to container (/mnt1cont/mnt2host).
+// The check is done by grepping the container's /proc/self/mountinfo for the
+// propagated mount point.
+func TestRootfsPropagationSlaveMount(t *testing.T) {
+       var mountPropagated bool
+       var dir1cont string
+       var dir2cont string
+
+       dir1cont = "/root/mnt1cont"
+
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+
+       // rslave: host-side mounts propagate into the container, not back out.
+       config.RootPropagation = unix.MS_SLAVE | unix.MS_REC
+
+       // Bind mount a volume
+       dir1host, err := ioutil.TempDir("", "mnt1host")
+       ok(t, err)
+       defer os.RemoveAll(dir1host)
+
+       // Make this dir a "shared" mount point. This will make sure a
+       // slave relationship can be established in container.
+       err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "")
+       ok(t, err)
+       err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "")
+       ok(t, err)
+       defer unmountOp(dir1host)
+
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Source:      dir1host,
+               Destination: dir1cont,
+               Device:      "bind",
+               Flags:       unix.MS_BIND | unix.MS_REC})
+
+       container, err := newContainerWithName("testSlaveMount", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       // Long-running init process: cat blocks on the pipe until stdinW is
+       // closed, keeping the container (and its mount namespace) alive.
+       stdinR, stdinW, err := os.Pipe()
+       ok(t, err)
+
+       pconfig := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR,
+               Init:  true,
+       }
+
+       err = container.Run(pconfig)
+       stdinR.Close()
+       defer stdinW.Close()
+       ok(t, err)
+
+       // Create mnt1host/mnt2host and bind mount itself on top of it. This
+       // should be visible in container.
+       dir2host, err := ioutil.TempDir(dir1host, "mnt2host")
+       ok(t, err)
+       defer os.RemoveAll(dir2host)
+
+       err = unix.Mount(dir2host, dir2host, "bind", unix.MS_BIND, "")
+       defer unmountOp(dir2host)
+       ok(t, err)
+
+       // Run "cat /proc/self/mountinfo" in container and look at mount points.
+       var stdout2 bytes.Buffer
+
+       stdinR2, stdinW2, err := os.Pipe()
+       ok(t, err)
+
+       pconfig2 := &libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"cat", "/proc/self/mountinfo"},
+               Env:    standardEnvironment,
+               Stdin:  stdinR2,
+               Stdout: &stdout2,
+       }
+
+       err = container.Run(pconfig2)
+       stdinR2.Close()
+       defer stdinW2.Close()
+       ok(t, err)
+
+       // NOTE(review): stdinW2/stdinW are closed here explicitly AND again by
+       // the defers above; the second Close returns an error that is ignored.
+       stdinW2.Close()
+       waitProcess(pconfig2, t)
+       stdinW.Close()
+       waitProcess(pconfig, t)
+
+       mountPropagated = false
+       dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host))
+
+       propagationInfo := string(stdout2.Bytes())
+       lines := strings.Split(propagationInfo, "\n")
+       for _, l := range lines {
+               // Field 5 (index 4) of a mountinfo line is the mount point.
+               linefields := strings.Split(l, " ")
+               if len(linefields) < 5 {
+                       continue
+               }
+
+               if linefields[4] == dir2cont {
+                       mountPropagated = true
+                       break
+               }
+       }
+
+       if mountPropagated != true {
+               t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont)
+       }
+}
+
+// Launch container with rootfsPropagation 0 so no propagation flags are
+// applied. Also bind mount a volume /mnt1host at /mnt1cont at the time of
+// launch. Now do a mount in container (/mnt1cont/mnt2cont) and this new
+// mount should propagate to host (/mnt1host/mnt2cont).
+// The check is done on the host with findmnt.
+
+func TestRootfsPropagationSharedMount(t *testing.T) {
+       var dir1cont string
+       var dir2cont string
+
+       dir1cont = "/root/mnt1cont"
+
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+       config := newTemplateConfig(rootfs)
+       config.RootPropagation = unix.MS_PRIVATE
+
+       // Bind mount a volume
+       dir1host, err := ioutil.TempDir("", "mnt1host")
+       ok(t, err)
+       defer os.RemoveAll(dir1host)
+
+       // Make this dir a "shared" mount point. This will make sure a
+       // shared relationship can be established in container.
+       err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "")
+       ok(t, err)
+       err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "")
+       ok(t, err)
+       defer unmountOp(dir1host)
+
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Source:      dir1host,
+               Destination: dir1cont,
+               Device:      "bind",
+               Flags:       unix.MS_BIND | unix.MS_REC})
+
+       container, err := newContainerWithName("testSharedMount", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       // Long-running init process: cat blocks until stdinW is closed,
+       // keeping the container's mount namespace alive for the second exec.
+       stdinR, stdinW, err := os.Pipe()
+       ok(t, err)
+
+       pconfig := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR,
+               Init:  true,
+       }
+
+       err = container.Run(pconfig)
+       stdinR.Close()
+       defer stdinW.Close()
+       ok(t, err)
+
+       // Create mnt1host/mnt2cont.  This will become visible inside container
+       // at mnt1cont/mnt2cont. Bind mount itself on top of it. This
+       // should be visible on host now.
+       dir2host, err := ioutil.TempDir(dir1host, "mnt2cont")
+       ok(t, err)
+       defer os.RemoveAll(dir2host)
+
+       dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host))
+
+       // Mount something in container and see if it is visible on host.
+       var stdout2 bytes.Buffer
+
+       stdinR2, stdinW2, err := os.Pipe()
+       ok(t, err)
+
+       pconfig2 := &libcontainer.Process{
+               Cwd:          "/",
+               Args:         []string{"mount", "--bind", dir2cont, dir2cont},
+               Env:          standardEnvironment,
+               Stdin:        stdinR2,
+               Stdout:       &stdout2,
+               Capabilities: &configs.Capabilities{},
+       }
+
+       // Provide CAP_SYS_ADMIN
+       // NOTE(review): these appends use config.Capabilities.* as the base
+       // (not pconfig2's own empty sets), so the resulting slices can alias
+       // the container config's backing arrays — confirm this is intended.
+       pconfig2.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_SYS_ADMIN")
+       pconfig2.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_SYS_ADMIN")
+       pconfig2.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_SYS_ADMIN")
+       pconfig2.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_SYS_ADMIN")
+
+       err = container.Run(pconfig2)
+       stdinR2.Close()
+       defer stdinW2.Close()
+       ok(t, err)
+
+       // Wait for process
+       stdinW2.Close()
+       waitProcess(pconfig2, t)
+       stdinW.Close()
+       waitProcess(pconfig, t)
+
+       defer unmountOp(dir2host)
+
+       // Check if mount is visible on host or not.
+       out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput()
+       outtrim := strings.TrimSpace(string(out))
+       if err != nil {
+               t.Logf("findmnt error %q: %q", err, outtrim)
+       }
+
+       // NOTE(review): string(outtrim) is a redundant conversion (outtrim is
+       // already a string), and the message below misspells "findmnt" as
+       // "finmnt" (message text left unchanged here).
+       if string(outtrim) != dir2host {
+               t.Fatalf("Mount in container on %s did not propagate to host on %s. finmnt output=%s", dir2cont, dir2host, outtrim)
+       }
+}
+
+// TestPIDHost removes the PID namespace from the container config and checks
+// that the container process shares the host's PID namespace (its
+// /proc/self/ns/pid link equals the host's /proc/1/ns/pid link).
+func TestPIDHost(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/pid")
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces.Remove(configs.NEWPID)
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/pid")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       // NOTE(review): the failure message says "ipc link" but this test
+       // compares PID-namespace links (message text left unchanged here).
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
+               t.Fatalf("ipc link not equal to host link %q %q", actual, l)
+       }
+}
+
+// TestInitJoinPID starts a first container, then starts a second container
+// joined to the first container's PID namespace (but with separate cgroups),
+// and verifies: both init processes share the same pidns link, the overall
+// namespace path sets still differ, and `ps` run in the first container sees
+// exactly both init processes plus itself.
+func TestInitJoinPID(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       // Execute a long-running container
+       container1, err := newContainer(newTemplateConfig(rootfs))
+       ok(t, err)
+       defer container1.Destroy()
+
+       stdinR1, stdinW1, err := os.Pipe()
+       ok(t, err)
+       init1 := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR1,
+               Init:  true,
+       }
+       err = container1.Run(init1)
+       stdinR1.Close()
+       defer stdinW1.Close()
+       ok(t, err)
+
+       // get the state of the first container
+       state1, err := container1.State()
+       ok(t, err)
+       pidns1 := state1.NamespacePaths[configs.NEWPID]
+
+       // Run a container inside the existing pidns but with different cgroups
+       config2 := newTemplateConfig(rootfs)
+       config2.Namespaces.Add(configs.NEWPID, pidns1)
+       config2.Cgroups.Path = "integration/test2"
+       container2, err := newContainerWithName("testCT2", config2)
+       ok(t, err)
+       defer container2.Destroy()
+
+       stdinR2, stdinW2, err := os.Pipe()
+       ok(t, err)
+       init2 := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR2,
+               Init:  true,
+       }
+       err = container2.Run(init2)
+       stdinR2.Close()
+       defer stdinW2.Close()
+       ok(t, err)
+       // get the state of the second container
+       state2, err := container2.State()
+       ok(t, err)
+
+       // Both init processes must resolve to the same pid-namespace inode.
+       ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid))
+       ok(t, err)
+       ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid))
+       ok(t, err)
+       if ns1 != ns2 {
+               t.Errorf("pidns(%s), wanted %s", ns2, ns1)
+       }
+
+       // check that namespaces are not the same
+       if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
+               t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
+                       state1.NamespacePaths)
+       }
+       // check that pidns is joined correctly. The initial container process list
+       // should contain the second container's init process
+       buffers := newStdBuffers()
+       ps := &libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"ps"},
+               Env:    standardEnvironment,
+               Stdout: buffers.Stdout,
+       }
+       err = container1.Run(ps)
+       ok(t, err)
+       waitProcess(ps, t)
+
+       // Stop init processes one by one. Stop the second container should
+       // not stop the first.
+       stdinW2.Close()
+       waitProcess(init2, t)
+       stdinW1.Close()
+       waitProcess(init1, t)
+
+       out := strings.TrimSpace(buffers.Stdout.String())
+       // output of ps inside the initial PID namespace should have
+       // 1 line of header,
+       // 2 lines of init processes,
+       // 1 line of ps process
+       if len(strings.Split(out, "\n")) != 4 {
+               t.Errorf("unexpected running process, output %q", out)
+       }
+}
+
+// TestInitJoinNetworkAndUser starts a first container with its own user and
+// network namespaces, then starts a second container joined to those two
+// namespaces (separate rootfs and cgroups), and verifies both init processes
+// share the same net and user namespace links while the full namespace path
+// sets still differ.
+func TestInitJoinNetworkAndUser(t *testing.T) {
+       if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+               t.Skip("userns is unsupported")
+       }
+       if testing.Short() {
+               return
+       }
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       // Execute a long-running container
+       config1 := newTemplateConfig(rootfs)
+       config1.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+       config1.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+       config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+       container1, err := newContainer(config1)
+       ok(t, err)
+       defer container1.Destroy()
+
+       stdinR1, stdinW1, err := os.Pipe()
+       ok(t, err)
+       init1 := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR1,
+               Init:  true,
+       }
+       err = container1.Run(init1)
+       stdinR1.Close()
+       defer stdinW1.Close()
+       ok(t, err)
+
+       // get the state of the first container
+       state1, err := container1.State()
+       ok(t, err)
+       netns1 := state1.NamespacePaths[configs.NEWNET]
+       userns1 := state1.NamespacePaths[configs.NEWUSER]
+
+       // Run a second container joined to the first one's network and user
+       // namespaces, but with its own rootfs and different cgroups.
+       rootfs2, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs2)
+
+       config2 := newTemplateConfig(rootfs2)
+       config2.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+       config2.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+       config2.Namespaces.Add(configs.NEWNET, netns1)
+       config2.Namespaces.Add(configs.NEWUSER, userns1)
+       config2.Cgroups.Path = "integration/test2"
+       container2, err := newContainerWithName("testCT2", config2)
+       ok(t, err)
+       defer container2.Destroy()
+
+       stdinR2, stdinW2, err := os.Pipe()
+       ok(t, err)
+       init2 := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR2,
+               Init:  true,
+       }
+       err = container2.Run(init2)
+       stdinR2.Close()
+       defer stdinW2.Close()
+       ok(t, err)
+
+       // get the state of the second container
+       state2, err := container2.State()
+       ok(t, err)
+
+       // Both inits must share the same net and user namespace inodes.
+       for _, ns := range []string{"net", "user"} {
+               ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns))
+               ok(t, err)
+               ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns))
+               ok(t, err)
+               if ns1 != ns2 {
+                       t.Errorf("%s(%s), wanted %s", ns, ns2, ns1)
+               }
+       }
+
+       // check that namespaces are not the same
+       if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
+               t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
+                       state1.NamespacePaths)
+       }
+       // Stop init processes one by one. Stop the second container should
+       // not stop the first.
+       stdinW2.Close()
+       waitProcess(init2, t)
+       stdinW1.Close()
+       waitProcess(init1, t)
+}
+
+// TestTmpfsCopyUp mounts a tmpfs over /etc with the EXT_COPYUP extension and
+// verifies the original contents (here /etc/passwd) were copied up into the
+// new tmpfs rather than being shadowed.
+func TestTmpfsCopyUp(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+
+       config.Mounts = append(config.Mounts, &configs.Mount{
+               Source:      "tmpfs",
+               Destination: "/etc",
+               Device:      "tmpfs",
+               Extensions:  configs.EXT_COPYUP,
+       })
+
+       container, err := newContainerWithName("test", config)
+       ok(t, err)
+       defer container.Destroy()
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Args:   []string{"ls", "/etc/passwd"},
+               Env:    standardEnvironment,
+               Stdin:  nil,
+               Stdout: &stdout,
+               Init:   true,
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       // NOTE(review): string(stdout.Bytes()) is equivalent to the idiomatic
+       // stdout.String() (staticcheck S1030).
+       outputLs := string(stdout.Bytes())
+
+       // Check that the ls output has /etc/passwd
+       if !strings.Contains(outputLs, "/etc/passwd") {
+               t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
+       }
+}
+
+// TestCGROUPPrivate adds a private cgroup namespace to the container config
+// and verifies the container's /proc/self/ns/cgroup link differs from the
+// host's /proc/1/ns/cgroup link. Skipped when the kernel lacks cgroupns.
+func TestCGROUPPrivate(t *testing.T) {
+       if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+               t.Skip("cgroupns is unsupported")
+       }
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/cgroup")
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       config.Namespaces.Add(configs.NEWCGROUP, "")
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
+               t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
+       }
+}
+
+// TestCGROUPHost runs a container without a cgroup namespace and verifies
+// the container's /proc/self/ns/cgroup link equals the host's
+// /proc/1/ns/cgroup link. Skipped when the kernel lacks cgroupns.
+func TestCGROUPHost(t *testing.T) {
+       if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+               t.Skip("cgroupns is unsupported")
+       }
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       l, err := os.Readlink("/proc/1/ns/cgroup")
+       ok(t, err)
+
+       config := newTemplateConfig(rootfs)
+       buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
+       ok(t, err)
+
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+       }
+
+       if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
+               t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
+       }
+}
diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go
new file mode 100644 (file)
index 0000000..14f8a59
--- /dev/null
@@ -0,0 +1,608 @@
+package integration
+
+import (
+       "bytes"
+       "fmt"
+       "io"
+       "os"
+       "strconv"
+       "strings"
+       "testing"
+       "time"
+
+       "github.com/containerd/console"
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/utils"
+
+       "golang.org/x/sys/unix"
+)
+
// TestExecIn starts a container whose init is a "cat" blocked on a pipe,
// execs a second process ("ps") into it, and checks that both processes are
// visible in the ps output.
func TestExecIn(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	// The read end was handed to the container; close our copy and keep the
	// write end open so init's "cat" stays blocked until the test is done.
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	buffers := newStdBuffers()
	ps := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"ps"},
		Env:    standardEnvironment,
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
	}

	err = container.Run(ps)
	ok(t, err)
	waitProcess(ps, t)
	// Closing the write end gives "cat" EOF so init can exit and be reaped.
	stdinW.Close()
	waitProcess(process, t)

	out := buffers.Stdout.String()
	if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
		t.Fatalf("unexpected running process, output %q", out)
	}
	if strings.Contains(out, "\r") {
		t.Fatalf("unexpected carriage-return in output")
	}
}
+
// TestExecInUsernsRlimit runs the per-process rlimit exec test with a user
// namespace enabled; skipped where userns is unavailable.
func TestExecInUsernsRlimit(t *testing.T) {
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("userns is unsupported")
	}

	testExecInRlimit(t, true)
}
+
// TestExecInRlimit runs the per-process rlimit exec test without a user
// namespace.
func TestExecInRlimit(t *testing.T) {
	testExecInRlimit(t, false)
}
+
// testExecInRlimit starts a container, execs "ulimit -n" into it with a
// per-process RLIMIT_NOFILE above the container's default, and checks that
// the exec'd process sees its own (higher) limit of 1026. When userns is
// true, the container additionally gets a user namespace with identity
// mappings for the first 1000 IDs.
func testExecInRlimit(t *testing.T, userns bool) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)
	if userns {
		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	}

	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Keep init alive with a "cat" blocked on the pipe's write end.
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	buffers := newStdBuffers()
	ps := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"/bin/sh", "-c", "ulimit -n"},
		Env:    standardEnvironment,
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
		Rlimits: []configs.Rlimit{
			// increase process rlimit higher than container rlimit to test per-process limit
			{Type: unix.RLIMIT_NOFILE, Hard: 1026, Soft: 1026},
		},
		// NOTE(review): Init is set on this second process even though the
		// container already has a running init; the other exec tests in this
		// file leave it unset — confirm this is intentional.
		Init: true,
	}
	err = container.Run(ps)
	ok(t, err)
	waitProcess(ps, t)

	stdinW.Close()
	waitProcess(process, t)

	out := buffers.Stdout.String()
	if limit := strings.TrimSpace(out); limit != "1026" {
		t.Fatalf("expected rlimit to be 1026, got %s", limit)
	}
}
+
+func TestExecInAdditionalGroups(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       ok(t, err)
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       container, err := newContainer(config)
+       ok(t, err)
+       defer container.Destroy()
+
+       // Execute a first process in the container
+       stdinR, stdinW, err := os.Pipe()
+       ok(t, err)
+       process := &libcontainer.Process{
+               Cwd:   "/",
+               Args:  []string{"cat"},
+               Env:   standardEnvironment,
+               Stdin: stdinR,
+               Init:  true,
+       }
+       err = container.Run(process)
+       stdinR.Close()
+       defer stdinW.Close()
+       ok(t, err)
+
+       var stdout bytes.Buffer
+       pconfig := libcontainer.Process{
+               Cwd:              "/",
+               Args:             []string{"sh", "-c", "id", "-Gn"},
+               Env:              standardEnvironment,
+               Stdin:            nil,
+               Stdout:           &stdout,
+               AdditionalGroups: []string{"plugdev", "audio"},
+       }
+       err = container.Run(&pconfig)
+       ok(t, err)
+
+       // Wait for process
+       waitProcess(&pconfig, t)
+
+       stdinW.Close()
+       waitProcess(process, t)
+
+       outputGroups := string(stdout.Bytes())
+
+       // Check that the groups output has the groups that we specified
+       if !strings.Contains(outputGroups, "audio") {
+               t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
+       }
+
+       if !strings.Contains(outputGroups, "plugdev") {
+               t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
+       }
+}
+
// TestExecInError repeatedly tries to exec a nonexistent binary into a
// running container and verifies that each attempt both returns an error
// and delivers the "executable file not found" message to the process's
// stderr.
func TestExecInError(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer func() {
		// Unblock init's "cat" and reap it; a wait failure is only logged so
		// it cannot mask an assertion failure from the loop below.
		stdinW.Close()
		if _, err := process.Wait(); err != nil {
			t.Log(err)
		}
	}()
	ok(t, err)

	// Repeat to exercise the failed-exec error path many times against the
	// same container (guards against fd/state leaks on the error path).
	for i := 0; i < 42; i++ {
		var out bytes.Buffer
		unexistent := &libcontainer.Process{
			Cwd:    "/",
			Args:   []string{"unexistent"},
			Env:    standardEnvironment,
			Stderr: &out,
		}
		err = container.Run(unexistent)
		if err == nil {
			t.Fatal("Should be an error")
		}
		if !strings.Contains(err.Error(), "executable file not found") {
			t.Fatalf("Should be error about not found executable, got %s", err)
		}
		if !bytes.Contains(out.Bytes(), []byte("executable file not found")) {
			t.Fatalf("executable file not found error not delivered to stdio:\n%s", out.String())
		}
	}
}
+
// TestExecInTTY execs "ps" into a running container with a console socket
// attached, receives the pty master fd over that socket, and checks the
// captured terminal output: both processes must appear and, with ONLCR
// cleared, no carriage returns should be present.
func TestExecInTTY(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	var stdout bytes.Buffer
	ps := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"ps"},
		Env:  standardEnvironment,
	}
	// The child end of the socket pair is given to the process; the parent
	// end receives the console (pty master) fd once the process starts.
	parent, child, err := utils.NewSockPair("console")
	if err != nil {
		ok(t, err)
	}
	defer parent.Close()
	defer child.Close()
	ps.ConsoleSocket = child
	type cdata struct {
		c   console.Console
		err error
	}
	dc := make(chan *cdata, 1)
	go func() {
		f, err := utils.RecvFd(parent)
		if err != nil {
			dc <- &cdata{
				err: err,
			}
			return
		}
		c, err := console.ConsoleFromFile(f)
		if err != nil {
			dc <- &cdata{
				err: err,
			}
			return
		}
		// Disable NL -> CR-NL output translation so the assertion below can
		// check for stray carriage returns.
		console.ClearONLCR(c.Fd())
		dc <- &cdata{
			c: c,
		}
	}()
	err = container.Run(ps)
	ok(t, err)
	data := <-dc
	if data.err != nil {
		ok(t, data.err)
	}
	// NOTE(review): these locals shadow the imported console package and the
	// builtin copy for the rest of the function.
	console := data.c
	copy := make(chan struct{})
	go func() {
		// Drain the pty into stdout until the process exits and the master
		// read fails, then signal completion.
		io.Copy(&stdout, console)
		close(copy)
	}()
	ok(t, err)
	select {
	case <-time.After(5 * time.Second):
		t.Fatal("Waiting for copy timed out")
	case <-copy:
	}
	waitProcess(ps, t)

	stdinW.Close()
	waitProcess(process, t)

	out := stdout.String()
	if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
		t.Fatalf("unexpected running process, output %q", out)
	}
	if strings.Contains(out, "\r") {
		t.Fatalf("unexpected carriage-return in output")
	}
}
+
// TestExecInEnvironment execs "env" into a running container with an
// explicit environment and verifies the resulting process environment:
// for duplicate keys the later value wins (DEBUG=false must override
// DEBUG=true), and HOME=/root is expected even though it was not passed.
func TestExecInEnvironment(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	buffers := newStdBuffers()
	process2 := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"env"},
		Env: []string{
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			// DEBUG is deliberately set twice; the assertions below require
			// the later value to win.
			"DEBUG=true",
			"DEBUG=false",
			"ENV=test",
		},
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
		// NOTE(review): Init is set although the container already has a
		// running init process — confirm this is intentional.
		Init:   true,
	}
	err = container.Run(process2)
	ok(t, err)
	waitProcess(process2, t)

	stdinW.Close()
	waitProcess(process, t)

	out := buffers.Stdout.String()
	// check execin's process environment
	if !strings.Contains(out, "DEBUG=false") ||
		!strings.Contains(out, "ENV=test") ||
		!strings.Contains(out, "HOME=/root") ||
		!strings.Contains(out, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") ||
		strings.Contains(out, "DEBUG=true") {
		t.Fatalf("unexpected running process, output %q", out)
	}
}
+
// TestExecinPassExtraFiles execs a shell into a running container with two
// extra pipe write-ends attached and verifies that they show up as fds 3
// and 4 inside the process and that writes through them reach the test.
func TestExecinPassExtraFiles(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	container, err := newContainer(config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	if err != nil {
		t.Fatal(err)
	}

	var stdout bytes.Buffer
	// Two pipes whose write ends are passed to the exec'd process as
	// ExtraFiles (fds 3 and 4); the shell writes one byte into each.
	pipeout1, pipein1, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	pipeout2, pipein2, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}
	inprocess := &libcontainer.Process{
		Cwd:        "/",
		Args:       []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"},
		Env:        []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
		ExtraFiles: []*os.File{pipein1, pipein2},
		Stdin:      nil,
		Stdout:     &stdout,
	}
	err = container.Run(inprocess)
	if err != nil {
		t.Fatal(err)
	}

	waitProcess(inprocess, t)
	stdinW.Close()
	waitProcess(process, t)

	out := string(stdout.Bytes())
	// fd 5 is the directory handle for /proc/$$/fd
	if out != "0 1 2 3 4 5" {
		t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out)
	}
	var buf = []byte{0}
	_, err = pipeout1.Read(buf)
	if err != nil {
		t.Fatal(err)
	}
	out1 := string(buf)
	if out1 != "1" {
		t.Fatalf("expected first pipe to receive '1', got '%s'", out1)
	}

	_, err = pipeout2.Read(buf)
	if err != nil {
		t.Fatal(err)
	}
	out2 := string(buf)
	if out2 != "2" {
		t.Fatalf("expected second pipe to receive '2', got '%s'", out2)
	}
}
+
// TestExecInOomScoreAdj checks that a process exec'd into a container whose
// config sets OomScoreAdj sees that value in /proc/self/oom_score_adj.
func TestExecInOomScoreAdj(t *testing.T) {
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	config.OomScoreAdj = ptrInt(200)
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Keep init alive with "cat" blocked on the pipe until the test is done.
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)
	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	buffers := newStdBuffers()
	ps := &libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"/bin/sh", "-c", "cat /proc/self/oom_score_adj"},
		Env:    standardEnvironment,
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
	}
	err = container.Run(ps)
	ok(t, err)
	waitProcess(ps, t)

	stdinW.Close()
	waitProcess(process, t)

	out := buffers.Stdout.String()
	if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) {
		t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj)
	}
}
+
// TestExecInUserns starts a container with a user namespace and verifies
// that a process exec'd into it joins the same user namespace as the
// container's init process.
func TestExecInUserns(t *testing.T) {
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("userns is unsupported")
	}
	if testing.Short() {
		return
	}
	rootfs, err := newRootfs()
	ok(t, err)
	defer remove(rootfs)
	config := newTemplateConfig(rootfs)
	config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
	config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
	config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	container, err := newContainer(config)
	ok(t, err)
	defer container.Destroy()

	// Execute a first process in the container
	stdinR, stdinW, err := os.Pipe()
	ok(t, err)

	process := &libcontainer.Process{
		Cwd:   "/",
		Args:  []string{"cat"},
		Env:   standardEnvironment,
		Stdin: stdinR,
		Init:  true,
	}
	err = container.Run(process)
	stdinR.Close()
	defer stdinW.Close()
	ok(t, err)

	// Record init's user-namespace link for comparison below.
	initPID, err := process.Pid()
	ok(t, err)
	initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID))
	ok(t, err)

	buffers := newStdBuffers()
	process2 := &libcontainer.Process{
		Cwd:  "/",
		Args: []string{"readlink", "/proc/self/ns/user"},
		Env: []string{
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
		},
		Stdout: buffers.Stdout,
		Stderr: os.Stderr,
	}
	err = container.Run(process2)
	ok(t, err)
	waitProcess(process2, t)
	stdinW.Close()
	waitProcess(process, t)

	if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns {
		t.Errorf("execin userns(%s), wanted %s", out, initUserns)
	}
}
diff --git a/libcontainer/integration/init_test.go b/libcontainer/integration/init_test.go
new file mode 100644 (file)
index 0000000..f5180ea
--- /dev/null
@@ -0,0 +1,46 @@
+package integration
+
+import (
+       "os"
+       "runtime"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer"
+       _ "github.com/opencontainers/runc/libcontainer/nsenter"
+
+       "github.com/sirupsen/logrus"
+)
+
// init runs the libcontainer initialization code when the test binary is
// re-executed with "init" as its first argument (busybox-style reexec).
// This works around the Go runtime's issues with forking: the container
// init path must run on a single, locked OS thread.
func init() {
	if len(os.Args) < 2 || os.Args[1] != "init" {
		return
	}
	runtime.GOMAXPROCS(1)
	runtime.LockOSThread()
	factory, err := libcontainer.New("")
	if err != nil {
		logrus.Fatalf("unable to initialize for container: %s", err)
	}
	if err := factory.StartInitialization(); err != nil {
		logrus.Fatal(err)
	}
}
+
+var testRoots []string
+
+func TestMain(m *testing.M) {
+       logrus.SetOutput(os.Stderr)
+       logrus.SetLevel(logrus.InfoLevel)
+
+       // Clean up roots after running everything.
+       defer func() {
+               for _, root := range testRoots {
+                       os.RemoveAll(root)
+               }
+       }()
+
+       ret := m.Run()
+       os.Exit(ret)
+}
diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go
new file mode 100644 (file)
index 0000000..77f1a8d
--- /dev/null
@@ -0,0 +1,422 @@
+// +build linux,cgo,seccomp
+
+package integration
+
+import (
+       "strings"
+       "syscall"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       libseccomp "github.com/seccomp/libseccomp-golang"
+)
+
+func TestSeccompDenyGetcwd(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "getcwd",
+                               Action: configs.Errno,
+                       },
+               },
+       }
+
+       container, err := newContainer(config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       buffers := newStdBuffers()
+       pwd := &libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"pwd"},
+               Env:    standardEnvironment,
+               Stdin:  buffers.Stdin,
+               Stdout: buffers.Stdout,
+               Stderr: buffers.Stderr,
+               Init:   true,
+       }
+
+       err = container.Run(pwd)
+       if err != nil {
+               t.Fatal(err)
+       }
+       ps, err := pwd.Wait()
+       if err == nil {
+               t.Fatal("Expecting error (negative return code); instead exited cleanly!")
+       }
+
+       var exitCode int
+       status := ps.Sys().(syscall.WaitStatus)
+       if status.Exited() {
+               exitCode = status.ExitStatus()
+       } else if status.Signaled() {
+               exitCode = -int(status.Signal())
+       } else {
+               t.Fatalf("Unrecognized exit reason!")
+       }
+
+       if exitCode == 0 {
+               t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+       }
+
+       expected := "pwd: getcwd: Operation not permitted"
+       actual := strings.Trim(buffers.Stderr.String(), "\n")
+       if actual != expected {
+               t.Fatalf("Expected output %s but got %s\n", expected, actual)
+       }
+}
+
+func TestSeccompPermitWriteConditional(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       container, err := newContainer(config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       buffers := newStdBuffers()
+       dmesg := &libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"busybox", "ls", "/"},
+               Env:    standardEnvironment,
+               Stdin:  buffers.Stdin,
+               Stdout: buffers.Stdout,
+               Stderr: buffers.Stderr,
+               Init:   true,
+       }
+
+       err = container.Run(dmesg)
+       if err != nil {
+               t.Fatal(err)
+       }
+       if _, err := dmesg.Wait(); err != nil {
+               t.Fatalf("%s: %s", err, buffers.Stderr)
+       }
+}
+
+func TestSeccompDenyWriteConditional(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       // Only test if library version is v2.2.1 or higher
+       // Conditional filtering will always error in v2.2.0 and lower
+       major, minor, micro := libseccomp.GetLibraryVersion()
+       if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       container, err := newContainer(config)
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer container.Destroy()
+
+       buffers := newStdBuffers()
+       dmesg := &libcontainer.Process{
+               Cwd:    "/",
+               Args:   []string{"busybox", "ls", "does_not_exist"},
+               Env:    standardEnvironment,
+               Stdin:  buffers.Stdin,
+               Stdout: buffers.Stdout,
+               Stderr: buffers.Stderr,
+               Init:   true,
+       }
+
+       err = container.Run(dmesg)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       ps, err := dmesg.Wait()
+       if err == nil {
+               t.Fatal("Expecting negative return, instead got 0!")
+       }
+
+       var exitCode int
+       status := ps.Sys().(syscall.WaitStatus)
+       if status.Exited() {
+               exitCode = status.ExitStatus()
+       } else if status.Signaled() {
+               exitCode = -int(status.Signal())
+       } else {
+               t.Fatalf("Unrecognized exit reason!")
+       }
+
+       if exitCode == 0 {
+               t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
+       }
+
+       // We're denying write to stderr, so we expect an empty buffer
+       expected := ""
+       actual := strings.Trim(buffers.Stderr.String(), "\n")
+       if actual != expected {
+               t.Fatalf("Expected output %s but got %s\n", expected, actual)
+       }
+}
+
+func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                                       {
+                                               Index: 2,
+                                               Value: 0,
+                                               Op:    configs.NotEqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       buffers, exitCode, err := runContainer(config, "", "ls", "/")
+       if err != nil {
+               t.Fatalf("%s: %s", buffers, err)
+       }
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
+       }
+       // We don't need to verify the actual thing printed
+       // Just that something was written to stdout
+       if len(buffers.Stdout.String()) == 0 {
+               t.Fatalf("Nothing was written to stdout, write call failed!\n")
+       }
+}
+
+func TestSeccompDenyWriteMultipleConditions(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       // Only test if library version is v2.2.1 or higher
+       // Conditional filtering will always error in v2.2.0 and lower
+       major, minor, micro := libseccomp.GetLibraryVersion()
+       if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                                       {
+                                               Index: 2,
+                                               Value: 0,
+                                               Op:    configs.NotEqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
+       if err == nil {
+               t.Fatalf("Expecting error return, instead got 0")
+       }
+       if exitCode == 0 {
+               t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
+       }
+
+       expected := ""
+       actual := strings.Trim(buffers.Stderr.String(), "\n")
+       if actual != expected {
+               t.Fatalf("Expected output %s but got %s\n", expected, actual)
+       }
+}
+
+func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       // Prevent writing to both stdout and stderr
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 1,
+                                               Op:    configs.EqualTo,
+                                       },
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       buffers, exitCode, err := runContainer(config, "", "ls", "/")
+       if err != nil {
+               t.Fatalf("%s: %s", buffers, err)
+       }
+       if exitCode != 0 {
+               t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
+       }
+       // Verify that nothing was printed
+       if len(buffers.Stdout.String()) != 0 {
+               t.Fatalf("Something was written to stdout, write call succeeded!\n")
+       }
+}
+
+func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) {
+       if testing.Short() {
+               return
+       }
+
+       rootfs, err := newRootfs()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer remove(rootfs)
+
+       // Prevent writing to both stdout and stderr
+       config := newTemplateConfig(rootfs)
+       config.Seccomp = &configs.Seccomp{
+               DefaultAction: configs.Allow,
+               Syscalls: []*configs.Syscall{
+                       {
+                               Name:   "write",
+                               Action: configs.Errno,
+                               Args: []*configs.Arg{
+                                       {
+                                               Index: 0,
+                                               Value: 1,
+                                               Op:    configs.EqualTo,
+                                       },
+                                       {
+                                               Index: 0,
+                                               Value: 2,
+                                               Op:    configs.EqualTo,
+                                       },
+                               },
+                       },
+               },
+       }
+
+       buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
+       if err == nil {
+               t.Fatalf("Expecting error return, instead got 0")
+       }
+       if exitCode == 0 {
+               t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
+       }
+       // Verify nothing was printed
+       if len(buffers.Stderr.String()) != 0 {
+               t.Fatalf("Something was written to stderr, write call succeeded!\n")
+       }
+}
diff --git a/libcontainer/integration/template_test.go b/libcontainer/integration/template_test.go
new file mode 100644 (file)
index 0000000..5f7cab5
--- /dev/null
@@ -0,0 +1,191 @@
+package integration
+
+import (
+       "github.com/opencontainers/runc/libcontainer/configs"
+
+       "golang.org/x/sys/unix"
+)
+
// standardEnvironment is the minimal environment handed to every test
// container process.
var standardEnvironment = []string{
	"HOME=/root",
	"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
	"HOSTNAME=integration",
	"TERM=xterm",
}

// defaultMountFlags is the flag set used for most pseudo-filesystem mounts
// in the template config: no exec, no setuid binaries, no device files.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+
+// newTemplateConfig returns a base template for running a container
+//
+// it uses a network strategy of just setting a loopback interface
+// and the default setup for devices
+func newTemplateConfig(rootfs string) *configs.Config {
+       allowAllDevices := false
+       return &configs.Config{
+               Rootfs: rootfs,
+               Capabilities: &configs.Capabilities{
+                       Bounding: []string{
+                               "CAP_CHOWN",
+                               "CAP_DAC_OVERRIDE",
+                               "CAP_FSETID",
+                               "CAP_FOWNER",
+                               "CAP_MKNOD",
+                               "CAP_NET_RAW",
+                               "CAP_SETGID",
+                               "CAP_SETUID",
+                               "CAP_SETFCAP",
+                               "CAP_SETPCAP",
+                               "CAP_NET_BIND_SERVICE",
+                               "CAP_SYS_CHROOT",
+                               "CAP_KILL",
+                               "CAP_AUDIT_WRITE",
+                       },
+                       Permitted: []string{
+                               "CAP_CHOWN",
+                               "CAP_DAC_OVERRIDE",
+                               "CAP_FSETID",
+                               "CAP_FOWNER",
+                               "CAP_MKNOD",
+                               "CAP_NET_RAW",
+                               "CAP_SETGID",
+                               "CAP_SETUID",
+                               "CAP_SETFCAP",
+                               "CAP_SETPCAP",
+                               "CAP_NET_BIND_SERVICE",
+                               "CAP_SYS_CHROOT",
+                               "CAP_KILL",
+                               "CAP_AUDIT_WRITE",
+                       },
+                       Inheritable: []string{
+                               "CAP_CHOWN",
+                               "CAP_DAC_OVERRIDE",
+                               "CAP_FSETID",
+                               "CAP_FOWNER",
+                               "CAP_MKNOD",
+                               "CAP_NET_RAW",
+                               "CAP_SETGID",
+                               "CAP_SETUID",
+                               "CAP_SETFCAP",
+                               "CAP_SETPCAP",
+                               "CAP_NET_BIND_SERVICE",
+                               "CAP_SYS_CHROOT",
+                               "CAP_KILL",
+                               "CAP_AUDIT_WRITE",
+                       },
+                       Ambient: []string{
+                               "CAP_CHOWN",
+                               "CAP_DAC_OVERRIDE",
+                               "CAP_FSETID",
+                               "CAP_FOWNER",
+                               "CAP_MKNOD",
+                               "CAP_NET_RAW",
+                               "CAP_SETGID",
+                               "CAP_SETUID",
+                               "CAP_SETFCAP",
+                               "CAP_SETPCAP",
+                               "CAP_NET_BIND_SERVICE",
+                               "CAP_SYS_CHROOT",
+                               "CAP_KILL",
+                               "CAP_AUDIT_WRITE",
+                       },
+                       Effective: []string{
+                               "CAP_CHOWN",
+                               "CAP_DAC_OVERRIDE",
+                               "CAP_FSETID",
+                               "CAP_FOWNER",
+                               "CAP_MKNOD",
+                               "CAP_NET_RAW",
+                               "CAP_SETGID",
+                               "CAP_SETUID",
+                               "CAP_SETFCAP",
+                               "CAP_SETPCAP",
+                               "CAP_NET_BIND_SERVICE",
+                               "CAP_SYS_CHROOT",
+                               "CAP_KILL",
+                               "CAP_AUDIT_WRITE",
+                       },
+               },
+               Namespaces: configs.Namespaces([]configs.Namespace{
+                       {Type: configs.NEWNS},
+                       {Type: configs.NEWUTS},
+                       {Type: configs.NEWIPC},
+                       {Type: configs.NEWPID},
+                       {Type: configs.NEWNET},
+               }),
+               Cgroups: &configs.Cgroup{
+                       Path: "integration/test",
+                       Resources: &configs.Resources{
+                               MemorySwappiness: nil,
+                               AllowAllDevices:  &allowAllDevices,
+                               AllowedDevices:   configs.DefaultAllowedDevices,
+                       },
+               },
+               MaskPaths: []string{
+                       "/proc/kcore",
+                       "/sys/firmware",
+               },
+               ReadonlyPaths: []string{
+                       "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+               },
+               Devices:  configs.DefaultAutoCreatedDevices,
+               Hostname: "integration",
+               Mounts: []*configs.Mount{
+                       {
+                               Source:      "proc",
+                               Destination: "/proc",
+                               Device:      "proc",
+                               Flags:       defaultMountFlags,
+                       },
+                       {
+                               Source:      "tmpfs",
+                               Destination: "/dev",
+                               Device:      "tmpfs",
+                               Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
+                               Data:        "mode=755",
+                       },
+                       {
+                               Source:      "devpts",
+                               Destination: "/dev/pts",
+                               Device:      "devpts",
+                               Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
+                               Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
+                       },
+                       {
+                               Device:      "tmpfs",
+                               Source:      "shm",
+                               Destination: "/dev/shm",
+                               Data:        "mode=1777,size=65536k",
+                               Flags:       defaultMountFlags,
+                       },
+                       /*
+                                           CI is broken on the debian based kernels with this
+                                                       {
+                                                               Source:      "mqueue",
+                                                               Destination: "/dev/mqueue",
+                                                               Device:      "mqueue",
+                                                               Flags:       defaultMountFlags,
+                                                       },
+                       */
+                       {
+                               Source:      "sysfs",
+                               Destination: "/sys",
+                               Device:      "sysfs",
+                               Flags:       defaultMountFlags | unix.MS_RDONLY,
+                       },
+               },
+               Networks: []*configs.Network{
+                       {
+                               Type:    "loopback",
+                               Address: "127.0.0.1/0",
+                               Gateway: "localhost",
+                       },
+               },
+               Rlimits: []configs.Rlimit{
+                       {
+                               Type: unix.RLIMIT_NOFILE,
+                               Hard: uint64(1025),
+                               Soft: uint64(1025),
+                       },
+               },
+       }
+}
diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go
new file mode 100644 (file)
index 0000000..8b2d714
--- /dev/null
@@ -0,0 +1,187 @@
+package integration
+
+import (
+       "bytes"
+       "crypto/md5"
+       "encoding/hex"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "runtime"
+       "strings"
+       "syscall"
+       "testing"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// ptrInt returns a pointer to a copy of v.
func ptrInt(v int) *int {
	p := v
	return &p
}
+
+func newStdBuffers() *stdBuffers {
+       return &stdBuffers{
+               Stdin:  bytes.NewBuffer(nil),
+               Stdout: bytes.NewBuffer(nil),
+               Stderr: bytes.NewBuffer(nil),
+       }
+}
+
// stdBuffers collects the standard I/O streams of a container process so
// tests can feed stdin and inspect what the process wrote.
type stdBuffers struct {
	Stdin  *bytes.Buffer // supplied to the process as stdin
	Stdout *bytes.Buffer // captures the process's stdout
	Stderr *bytes.Buffer // captures the process's stderr
}
+
+func (b *stdBuffers) String() string {
+       s := []string{}
+       if b.Stderr != nil {
+               s = append(s, b.Stderr.String())
+       }
+       if b.Stdout != nil {
+               s = append(s, b.Stdout.String())
+       }
+       return strings.Join(s, "|")
+}
+
// ok fails the test if an err is not nil, reporting the caller's file and
// line rather than this helper's.
func ok(t testing.TB, err error) {
	if err == nil {
		return
	}
	_, file, line, _ := runtime.Caller(1)
	t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error())
}
+
+func waitProcess(p *libcontainer.Process, t *testing.T) {
+       _, file, line, _ := runtime.Caller(1)
+       status, err := p.Wait()
+
+       if err != nil {
+               t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error())
+       }
+
+       if !status.Success() {
+               t.Fatalf("%s:%d: unexpected status: %s\n\n", filepath.Base(file), line, status.String())
+       }
+}
+
+func newTestRoot() (string, error) {
+       dir, err := ioutil.TempDir("", "libcontainer")
+       if err != nil {
+               return "", err
+       }
+       if err := os.MkdirAll(dir, 0700); err != nil {
+               return "", err
+       }
+       testRoots = append(testRoots, dir)
+       return dir, nil
+}
+
// newTestBundle creates a temporary bundle directory for a single test.
// ioutil.TempDir creates it with mode 0700 already, so no extra MkdirAll
// is needed (the previous one was redundant).
func newTestBundle() (string, error) {
	dir, err := ioutil.TempDir("", "bundle")
	if err != nil {
		return "", err
	}
	return dir, nil
}
+
+// newRootfs creates a new tmp directory and copies the busybox root filesystem
+func newRootfs() (string, error) {
+       dir, err := ioutil.TempDir("", "")
+       if err != nil {
+               return "", err
+       }
+       if err := os.MkdirAll(dir, 0700); err != nil {
+               return "", err
+       }
+       if err := copyBusybox(dir); err != nil {
+               return "", err
+       }
+       return dir, nil
+}
+
// remove deletes dir and everything beneath it. Errors are ignored: this
// is best-effort test cleanup.
func remove(dir string) {
	_ = os.RemoveAll(dir)
}
+
// copyBusybox copies the rootfs for a busybox container created for the
// test image into the new directory for the specific test. A shell is used
// so the /busybox/* glob expands.
func copyBusybox(dest string) error {
	cmd := exec.Command("sh", "-c", fmt.Sprintf("cp -a /busybox/* %s/", dest))
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("copy error %q: %q", err, out)
	}
	return nil
}
+
+func newContainer(config *configs.Config) (libcontainer.Container, error) {
+       h := md5.New()
+       h.Write([]byte(time.Now().String()))
+       return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config)
+}
+
+func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) {
+       root, err := newTestRoot()
+       if err != nil {
+               return nil, err
+       }
+
+       f, err := libcontainer.New(root, libcontainer.Cgroupfs)
+       if err != nil {
+               return nil, err
+       }
+       if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" {
+               f, err = libcontainer.New(root, libcontainer.SystemdCgroups)
+               if err != nil {
+                       return nil, err
+               }
+       }
+       return f.Create(name, config)
+}
+
// runContainer runs the container with the specific config and arguments
//
// buffers are returned containing the STDOUT and STDERR output for the run
// along with the exit code and any go error
func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) {
	container, err := newContainer(config)
	if err != nil {
		return nil, -1, err
	}
	// Best-effort teardown; the container's state dir is removed elsewhere.
	defer container.Destroy()
	buffers = newStdBuffers()
	process := &libcontainer.Process{
		Cwd:    "/",
		Args:   args,
		Env:    standardEnvironment,
		Stdin:  buffers.Stdin,
		Stdout: buffers.Stdout,
		Stderr: buffers.Stderr,
		Init:   true, // this process is the container's init
	}

	err = container.Run(process)
	if err != nil {
		return buffers, -1, err
	}
	ps, err := process.Wait()
	if err != nil {
		return buffers, -1, err
	}
	// Translate the wait status: normal exit keeps its exit code, a fatal
	// signal is reported as the negated signal number.
	status := ps.Sys().(syscall.WaitStatus)
	if status.Exited() {
		exitCode = status.ExitStatus()
	} else if status.Signaled() {
		exitCode = -int(status.Signal())
	} else {
		// NOTE(review): err is nil on this path (a non-nil err returned
		// earlier), so a stopped/other status yields (-1, nil) — callers
		// cannot distinguish it from a clean failure. Confirm intent.
		return buffers, -1, err
	}
	return
}
diff --git a/libcontainer/intelrdt/intelrdt.go b/libcontainer/intelrdt/intelrdt.go
new file mode 100644 (file)
index 0000000..0071ce7
--- /dev/null
@@ -0,0 +1,773 @@
+// +build linux
+
+package intelrdt
+
+import (
+       "bufio"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+       "sync"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+/*
+ * About Intel RDT features:
+ * Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+ * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
+ * two sub-features of RDT.
+ *
+ * Cache Allocation Technology (CAT) provides a way for the software to restrict
+ * cache allocation to a defined 'subset' of L3 cache which may be overlapping
+ * with other 'subsets'. The different subsets are identified by class of
+ * service (CLOS) and each CLOS has a capacity bitmask (CBM).
+ *
+ * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
+ * over memory bandwidth for the software. A user controls the resource by
+ * indicating the percentage of maximum memory bandwidth or memory bandwidth
+ * limit in MBps unit if MBA Software Controller is enabled.
+ *
+ * More details about Intel RDT CAT and MBA can be found in the section 17.18
+ * of Intel Software Developer Manual:
+ * https://software.intel.com/en-us/articles/intel-sdm
+ *
+ * About Intel RDT kernel interface:
+ * In Linux 4.10 kernel or newer, the interface is defined and exposed via
+ * "resource control" filesystem, which is a "cgroup-like" interface.
+ *
+ * Compared with cgroups, it has a similar process management lifecycle and
+ * interfaces in a container. But unlike cgroups' hierarchy, it has single level
+ * filesystem layout.
+ *
+ * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
+ * "resource control" filesystem.
+ *
+ * Intel RDT "resource control" filesystem hierarchy:
+ * mount -t resctrl resctrl /sys/fs/resctrl
+ * tree /sys/fs/resctrl
+ * /sys/fs/resctrl/
+ * |-- info
+ * |   |-- L3
+ * |   |   |-- cbm_mask
+ * |   |   |-- min_cbm_bits
+ * |   |   |-- num_closids
+ * |   |-- MB
+ * |       |-- bandwidth_gran
+ * |       |-- delay_linear
+ * |       |-- min_bandwidth
+ * |       |-- num_closids
+ * |-- ...
+ * |-- schemata
+ * |-- tasks
+ * |-- <container_id>
+ *     |-- ...
+ *     |-- schemata
+ *     |-- tasks
+ *
+ * For runc, we can make use of `tasks` and `schemata` configuration for L3
+ * cache and memory bandwidth resources constraints.
+ *
+ * The file `tasks` has a list of tasks that belong to this group (e.g., the
+ * "<container_id>" group). Tasks can be added to a group by writing the task ID
+ * to the "tasks" file (which will automatically remove them from the previous
+ * group to which they belonged). New tasks created by fork(2) and clone(2) are
+ * added to the same group as their parent.
+ *
+ * The file `schemata` has a list of all the resources available to this group.
+ * Each resource (L3 cache, memory bandwidth) has its own line and format.
+ *
+ * L3 cache schema:
+ * It has allocation bitmasks/values for L3 cache on each socket, which
+ * contains L3 cache id and capacity bitmask (CBM).
+ *     Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+ * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
+ * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+ *
+ * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
+ * be set is less than the max bit. The max bits in the CBM is varied among
+ * supported Intel CPU models. Kernel will check if it is valid when writing.
+ * e.g., default value 0xfffff in root indicates the max bits of CBM is 20
+ * bits, which maps to the entire L3 cache capacity. Some valid CBM values to
+ * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+ *
+ * Memory bandwidth schema:
+ * It has allocation values for memory bandwidth on each socket, which contains
+ * L3 cache id and memory bandwidth.
+ *     Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+ * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
+ *
+ * The minimum bandwidth percentage value for each CPU model is predefined and
+ * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
+ * that is allocated is also dependent on the CPU model and can be looked up at
+ * "info/MB/bandwidth_gran". The available bandwidth control steps are:
+ * min_bw + N * bw_gran. Intermediate values are rounded to the next control
+ * step available on the hardware.
+ *
+ * If MBA Software Controller is enabled through mount option "-o mba_MBps":
+ * mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
+ * We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit
+ * instead of "percentages". The kernel underneath would use a software feedback
+ * mechanism or a "Software Controller" which reads the actual bandwidth using
+ * MBM counters and adjust the memory bandwidth percentages to ensure:
+ * "actual memory bandwidth < user specified memory bandwidth".
+ *
+ * For example, on a two-socket machine, the schema line could be
+ * "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0
+ * and 7000 MBps memory bandwidth limit on socket 1.
+ *
+ * For more information about Intel RDT kernel interface:
+ * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+ *
+ * An example for runc:
+ * Consider a two-socket machine with two L3 caches where the default CBM is
+ * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
+ * with a memory bandwidth granularity of 10%.
+ *
+ * Tasks inside the container only have access to the "upper" 7/11 of L3 cache
+ * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
+ * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
+ *
+ * "linux": {
+ *     "intelRdt": {
+ *         "l3CacheSchema": "L3:0=7f0;1=1f",
+ *         "memBwSchema": "MB:0=20;1=70"
+ *     }
+ * }
+ */
+
// Manager abstracts management of a container's Intel RDT "resource
// control" group: joining a process, reading stats, configuring the
// schemata, and destroying the group.
type Manager interface {
	// Applies Intel RDT configuration to the process with the specified pid
	Apply(pid int) error

	// Returns statistics for Intel RDT
	GetStats() (*Stats, error)

	// Destroys the Intel RDT 'container_id' group
	Destroy() error

	// Returns Intel RDT path to save in a state file and to be able to
	// restore the object later
	GetPath() string

	// Set Intel RDT "resource control" filesystem as configured.
	Set(container *configs.Config) error
}
+
// IntelRdtManager implements the Manager interface for the Intel RDT
// "resource control" filesystem.
type IntelRdtManager struct {
	mu     sync.Mutex      // guards Path against concurrent Apply/Destroy
	Config *configs.Config // container config holding the IntelRdt section
	Id     string          // container id; names the resctrl group directory
	Path   string          // cached path of this container's resctrl group
}

const (
	// IntelRdtTasks is the name of the per-group file that lists (and
	// accepts) task IDs belonging to the group.
	IntelRdtTasks = "tasks"
)
+
var (
	// The absolute root path of the Intel RDT "resource control" filesystem
	intelRdtRoot     string
	intelRdtRootLock sync.Mutex

	// The flag to indicate if Intel RDT/CAT is enabled
	isCatEnabled bool
	// The flag to indicate if Intel RDT/MBA is enabled
	isMbaEnabled bool
	// The flag to indicate if Intel RDT/MBA Software Controller is enabled
	isMbaScEnabled bool
)

// intelRdtData bundles everything needed to join a process into a resctrl
// group: the filesystem root, the container config, and the target pid.
type intelRdtData struct {
	root   string
	config *configs.Config
	pid    int
}
+
// Check if Intel RDT sub-features are enabled in init()
func init() {
	// 1. Check if hardware and kernel support Intel RDT sub-features
	// "cat_l3" flag for CAT and "mba" flag for MBA
	isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
	if err != nil {
		return
	}

	// 2. Check if Intel RDT "resource control" filesystem is mounted
	// The user guarantees to mount the filesystem.
	// NOTE: this call also caches intelRdtRoot and — via the mountinfo
	// scan — sets isMbaScEnabled when the mount carries "mba_MBps", so it
	// must run before the isMbaScEnabled check below.
	if !isIntelRdtMounted() {
		return
	}

	// 3. Double check if Intel RDT sub-features are available in
	// "resource control" filesystem. Intel RDT sub-features can be
	// selectively disabled or enabled by kernel command line
	// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
	if isCatFlagSet {
		if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
			isCatEnabled = true
		}
	}
	if isMbaScEnabled {
		// We confirm MBA Software Controller is enabled in step 2,
		// MBA should be enabled because MBA Software Controller
		// depends on MBA
		isMbaEnabled = true
	} else if isMbaFlagSet {
		if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
			isMbaEnabled = true
		}
	}
}
+
// findIntelRdtMountpointDir returns the mount point path of the Intel RDT
// "resource control" filesystem by scanning /proc/self/mountinfo for a
// "resctrl" filesystem entry. As a side effect it sets isMbaScEnabled when
// the mount carries the "mba_MBps" super option.
func findIntelRdtMountpointDir() (string, error) {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return "", err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		text := s.Text()
		fields := strings.Split(text, " ")
		// Safe as mountinfo encodes mountpoints with spaces as \040.
		// Everything after " - " is: fstype, mount source, super options.
		index := strings.Index(text, " - ")
		postSeparatorFields := strings.Fields(text[index+3:])
		numPostFields := len(postSeparatorFields)

		// This is an error as we can't detect if the mount is for "Intel RDT"
		if numPostFields == 0 {
			return "", fmt.Errorf("Found no fields post '-' in %q", text)
		}

		if postSeparatorFields[0] == "resctrl" {
			// Check that the mount is properly formatted.
			if numPostFields < 3 {
				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
			}

			// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
			if strings.Contains(postSeparatorFields[2], "mba_MBps") {
				isMbaScEnabled = true
			}

			// fields[4] is the mount point column of mountinfo.
			return fields[4], nil
		}
	}
	if err := s.Err(); err != nil {
		return "", err
	}

	return "", NewNotFoundError("Intel RDT")
}
+
+// Gets the root path of Intel RDT "resource control" filesystem
+func getIntelRdtRoot() (string, error) {
+       intelRdtRootLock.Lock()
+       defer intelRdtRootLock.Unlock()
+
+       if intelRdtRoot != "" {
+               return intelRdtRoot, nil
+       }
+
+       root, err := findIntelRdtMountpointDir()
+       if err != nil {
+               return "", err
+       }
+
+       if _, err := os.Stat(root); err != nil {
+               return "", err
+       }
+
+       intelRdtRoot = root
+       return intelRdtRoot, nil
+}
+
+func isIntelRdtMounted() bool {
+       _, err := getIntelRdtRoot()
+       if err != nil {
+               return false
+       }
+
+       return true
+}
+
// parseCpuInfoFile scans path (normally /proc/cpuinfo) for the "cat_l3"
// and "mba" CPU feature flags and reports which are present. Only the
// first "flags" line is inspected, since all CPUs report the same flags.
//
// Fix: the original checked s.Err() inside the scan loop, where it is
// always nil — bufio.Scanner only exposes its error after Scan returns
// false. The check now follows the loop, where it can actually fire.
func parseCpuInfoFile(path string) (bool, bool, error) {
	isCatFlagSet := false
	isMbaFlagSet := false

	f, err := os.Open(path)
	if err != nil {
		return false, false, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()

		// Search "cat_l3" and "mba" flags in first "flags" line
		if strings.Contains(line, "flags") {
			// "cat_l3" flag for CAT and "mba" flag for MBA
			for _, flag := range strings.Split(line, " ") {
				switch flag {
				case "cat_l3":
					isCatFlagSet = true
				case "mba":
					isMbaFlagSet = true
				}
			}
			return isCatFlagSet, isMbaFlagSet, nil
		}
	}
	if err := s.Err(); err != nil {
		return false, false, err
	}
	return isCatFlagSet, isMbaFlagSet, nil
}
+
// parseUint parses s as an unsigned integer. Negative inputs — including
// ones that underflow int64 — are treated as 0 with no error, matching the
// convention that a negative kernel value means "unset/unlimited".
func parseUint(s string, base, bitSize int) (uint64, error) {
	value, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return value, nil
	}

	// ParseUint rejects negative numbers; re-parse as signed to detect them.
	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	switch {
	case intErr == nil && intValue < 0:
		// 1. Negative value representable in int64.
		return 0, nil
	case intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0:
		// 2. Negative value below MinInt64.
		return 0, nil
	}

	return value, err
}
+
+// Gets a single uint64 value from the specified file.
+func getIntelRdtParamUint(path, file string) (uint64, error) {
+       fileName := filepath.Join(path, file)
+       contents, err := ioutil.ReadFile(fileName)
+       if err != nil {
+               return 0, err
+       }
+
+       res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
+       if err != nil {
+               return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
+       }
+       return res, nil
+}
+
// getIntelRdtParamString reads path/file and returns its contents with
// surrounding whitespace removed.
func getIntelRdtParamString(path, file string) (string, error) {
	raw, err := ioutil.ReadFile(filepath.Join(path, file))
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}
+
// writeFile writes data plus a trailing newline to dir/file (mode 0700).
// An empty dir is rejected up front with an error.
func writeFile(dir, file, data string) error {
	if dir == "" {
		return fmt.Errorf("no such directory for %s", file)
	}
	payload := []byte(data + "\n")
	if err := ioutil.WriteFile(filepath.Join(dir, file), payload, 0700); err != nil {
		return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
	}
	return nil
}
+
+func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) {
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return nil, err
+       }
+       return &intelRdtData{
+               root:   rootPath,
+               config: c,
+               pid:    pid,
+       }, nil
+}
+
+// Get the read-only L3 cache information
+func getL3CacheInfo() (*L3CacheInfo, error) {
+       l3CacheInfo := &L3CacheInfo{}
+
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return l3CacheInfo, err
+       }
+
+       path := filepath.Join(rootPath, "info", "L3")
+       cbmMask, err := getIntelRdtParamString(path, "cbm_mask")
+       if err != nil {
+               return l3CacheInfo, err
+       }
+       minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits")
+       if err != nil {
+               return l3CacheInfo, err
+       }
+       numClosids, err := getIntelRdtParamUint(path, "num_closids")
+       if err != nil {
+               return l3CacheInfo, err
+       }
+
+       l3CacheInfo.CbmMask = cbmMask
+       l3CacheInfo.MinCbmBits = minCbmBits
+       l3CacheInfo.NumClosids = numClosids
+
+       return l3CacheInfo, nil
+}
+
+// Get the read-only memory bandwidth information
+func getMemBwInfo() (*MemBwInfo, error) {
+       memBwInfo := &MemBwInfo{}
+
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return memBwInfo, err
+       }
+
+       path := filepath.Join(rootPath, "info", "MB")
+       bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran")
+       if err != nil {
+               return memBwInfo, err
+       }
+       delayLinear, err := getIntelRdtParamUint(path, "delay_linear")
+       if err != nil {
+               return memBwInfo, err
+       }
+       minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth")
+       if err != nil {
+               return memBwInfo, err
+       }
+       numClosids, err := getIntelRdtParamUint(path, "num_closids")
+       if err != nil {
+               return memBwInfo, err
+       }
+
+       memBwInfo.BandwidthGran = bandwidthGran
+       memBwInfo.DelayLinear = delayLinear
+       memBwInfo.MinBandwidth = minBandwidth
+       memBwInfo.NumClosids = numClosids
+
+       return memBwInfo, nil
+}
+
+// Get diagnostics for last filesystem operation error from file info/last_cmd_status
+func getLastCmdStatus() (string, error) {
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return "", err
+       }
+
+       path := filepath.Join(rootPath, "info")
+       lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status")
+       if err != nil {
+               return "", err
+       }
+
+       return lastCmdStatus, nil
+}
+
+// WriteIntelRdtTasks writes the specified pid into the "tasks" file
+func WriteIntelRdtTasks(dir string, pid int) error {
+       if dir == "" {
+               return fmt.Errorf("no such directory for %s", IntelRdtTasks)
+       }
+
+       // Don't attach any pid if -1 is specified as a pid
+       if pid != -1 {
+               if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil {
+                       return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
+               }
+       }
+       return nil
+}
+
// IsCatEnabled reports whether Intel RDT/CAT (Cache Allocation Technology)
// was detected at package init.
func IsCatEnabled() bool {
	return isCatEnabled
}

// IsMbaEnabled reports whether Intel RDT/MBA (Memory Bandwidth Allocation)
// was detected at package init.
func IsMbaEnabled() bool {
	return isMbaEnabled
}

// IsMbaScEnabled reports whether the MBA Software Controller mount option
// ("mba_MBps") was detected at package init.
func IsMbaScEnabled() bool {
	return isMbaScEnabled
}
+
+// Get the 'container_id' path in Intel RDT "resource control" filesystem
+func GetIntelRdtPath(id string) (string, error) {
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return "", err
+       }
+
+       path := filepath.Join(rootPath, id)
+       return path, nil
+}
+
+// Apply applies the Intel RDT configuration to the process with the
+// specified pid by joining it into the 'container_id' group and caching
+// the resulting group path in m.Path.
+func (m *IntelRdtManager) Apply(pid int) (err error) {
+       // If intelRdt is not specified in config, we do nothing
+       if m.Config.IntelRdt == nil {
+               return nil
+       }
+       d, err := getIntelRdtData(m.Config, pid)
+       if err != nil && !IsNotFound(err) {
+               return err
+       }
+
+       // NOTE(review): when getIntelRdtData returns a NotFoundError we fall
+       // through with that error swallowed; if d is nil in that case, d.join
+       // below would dereference nil — confirm getIntelRdtData's contract.
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       path, err := d.join(m.Id)
+       if err != nil {
+               return err
+       }
+
+       m.Path = path
+       return nil
+}
+
+// Destroy removes the Intel RDT 'container_id' group directory (and
+// everything below it) and clears the cached path.
+func (m *IntelRdtManager) Destroy() error {
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       if err := os.RemoveAll(m.GetPath()); err != nil {
+               return err
+       }
+       m.Path = ""
+       return nil
+}
+
+// GetPath returns the Intel RDT path to save in a state file and to be
+// able to restore the object later. The path is computed lazily on
+// first use and cached in m.Path; lookup errors are deliberately
+// ignored here (an empty string is returned in that case).
+func (m *IntelRdtManager) GetPath() string {
+       if m.Path == "" {
+               m.Path, _ = GetIntelRdtPath(m.Id)
+       }
+       return m.Path
+}
+
+// GetStats returns statistics for Intel RDT: the read-only hardware
+// information plus the schemata lines from both the resctrl root and
+// the 'container_id' group. It returns (nil, nil) when intelRdt is not
+// specified in the config.
+func (m *IntelRdtManager) GetStats() (*Stats, error) {
+       // If intelRdt is not specified in config
+       if m.Config.IntelRdt == nil {
+               return nil, nil
+       }
+
+       m.mu.Lock()
+       defer m.mu.Unlock()
+       stats := NewStats()
+
+       rootPath, err := getIntelRdtRoot()
+       if err != nil {
+               return nil, err
+       }
+       // The read-only L3 cache and memory bandwidth schemata in root
+       tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
+       if err != nil {
+               return nil, err
+       }
+       schemaRootStrings := strings.Split(tmpRootStrings, "\n")
+
+       // The L3 cache and memory bandwidth schemata in 'container_id' group
+       tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata")
+       if err != nil {
+               return nil, err
+       }
+       schemaStrings := strings.Split(tmpStrings, "\n")
+
+       if IsCatEnabled() {
+               // The read-only L3 cache information
+               l3CacheInfo, err := getL3CacheInfo()
+               if err != nil {
+                       return nil, err
+               }
+               stats.L3CacheInfo = l3CacheInfo
+
+               // The read-only L3 cache schema in root
+               for _, schemaRoot := range schemaRootStrings {
+                       if strings.Contains(schemaRoot, "L3") {
+                               stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot)
+                       }
+               }
+
+               // The L3 cache schema in 'container_id' group
+               for _, schema := range schemaStrings {
+                       if strings.Contains(schema, "L3") {
+                               stats.L3CacheSchema = strings.TrimSpace(schema)
+                       }
+               }
+       }
+
+       if IsMbaEnabled() {
+               // The read-only memory bandwidth information
+               memBwInfo, err := getMemBwInfo()
+               if err != nil {
+                       return nil, err
+               }
+               stats.MemBwInfo = memBwInfo
+
+               // The read-only memory bandwidth schema in root
+               for _, schemaRoot := range schemaRootStrings {
+                       if strings.Contains(schemaRoot, "MB") {
+                               stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot)
+                       }
+               }
+
+               // The memory bandwidth schema in 'container_id' group
+               for _, schema := range schemaStrings {
+                       if strings.Contains(schema, "MB") {
+                               stats.MemBwSchema = strings.TrimSpace(schema)
+                       }
+               }
+       }
+
+       return stats, nil
+}
+
+// Set writes the configured L3 cache and/or memory bandwidth schemata
+// into the 'container_id' group's "schemata" file.
+//
+// L3 cache schema format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+// e.g. "L3:0=ff;1=c0" on a two-socket machine (cache id 0's capacity
+// bitmask is 0xff, cache id 1's is 0xc0). A valid CBM is a *contiguous
+// bits set*; the maximum number of bits varies by CPU model and the
+// kernel validates it on write — the root default (e.g. 0xfffff for a
+// 20-bit CBM) maps to the entire L3 cache capacity. Some valid values
+// to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+//
+// Memory bandwidth schema format:
+//     "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+// e.g. "MB:0=20;1=70". The minimum percentage and the allocation
+// granularity are CPU model specific and can be looked up through
+// "info/MB/min_bandwidth" and "info/MB/bandwidth_gran"; the available
+// control steps are min_bw + N * bw_gran, and intermediate values are
+// rounded to the next step available on the hardware.
+//
+// If the MBA Software Controller is enabled via the mount option
+// "-o mba_MBps" (mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl),
+// bandwidth is specified in MBps (Mega Bytes per second) instead of
+// percentages, e.g. "MB:0=5000;1=7000"; the kernel then uses MBM
+// counters as a software feedback mechanism to keep the actual memory
+// bandwidth below the user-specified limit.
+func (m *IntelRdtManager) Set(container *configs.Config) error {
+       if container.IntelRdt == nil {
+               return nil
+       }
+
+       path := m.GetPath()
+       l3CacheSchema := container.IntelRdt.L3CacheSchema
+       memBwSchema := container.IntelRdt.MemBwSchema
+
+       // Build the single schemata payload: both schemas joined with a
+       // newline, or whichever one is present; nothing is written when
+       // both are empty.
+       var schemata string
+       switch {
+       case l3CacheSchema != "" && memBwSchema != "":
+               schemata = l3CacheSchema + "\n" + memBwSchema
+       case l3CacheSchema != "":
+               schemata = l3CacheSchema
+       case memBwSchema != "":
+               schemata = memBwSchema
+       default:
+               return nil
+       }
+
+       if err := writeFile(path, "schemata", schemata); err != nil {
+               return NewLastCmdError(err)
+       }
+       return nil
+}
+
+func (raw *intelRdtData) join(id string) (string, error) {
+       path := filepath.Join(raw.root, id)
+       if err := os.MkdirAll(path, 0755); err != nil {
+               return "", NewLastCmdError(err)
+       }
+
+       if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
+               return "", NewLastCmdError(err)
+       }
+       return path, nil
+}
+
+// NotFoundError reports that the mount point for the named "resource
+// control" filesystem could not be found.
+type NotFoundError struct {
+       ResourceControl string
+}
+
+// Error implements the error interface.
+func (e *NotFoundError) Error() string {
+       return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl)
+}
+
+// NewNotFoundError returns a *NotFoundError for the named resource.
+func NewNotFoundError(res string) error {
+       return &NotFoundError{
+               ResourceControl: res,
+       }
+}
+
+// IsNotFound reports whether err is a *NotFoundError.
+func IsNotFound(err error) bool {
+       if err == nil {
+               return false
+       }
+       _, ok := err.(*NotFoundError)
+       return ok
+}
+
+// LastCmdError couples an underlying error with the kernel diagnostics
+// read from info/last_cmd_status.
+type LastCmdError struct {
+       LastCmdStatus string
+       Err           error
+}
+
+func (e *LastCmdError) Error() string {
+       return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus)
+}
+
+func NewLastCmdError(err error) error {
+       lastCmdStatus, err1 := getLastCmdStatus()
+       if err1 == nil {
+               return &LastCmdError{
+                       LastCmdStatus: lastCmdStatus,
+                       Err:           err,
+               }
+       }
+       return err
+}
diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go
new file mode 100644 (file)
index 0000000..a19b961
--- /dev/null
@@ -0,0 +1,122 @@
+// +build linux
+
+package intelrdt
+
+import (
+       "strings"
+       "testing"
+)
+
+// TestIntelRdtSetL3CacheSchema verifies that Set rewrites the L3 line
+// of the mock "schemata" file.
+// NOTE(review): the early `return` skips silently when CAT is disabled;
+// t.Skip would make the skip visible in test output.
+func TestIntelRdtSetL3CacheSchema(t *testing.T) {
+       if !IsCatEnabled() {
+               return
+       }
+
+       helper := NewIntelRdtTestUtil(t)
+       defer helper.cleanup()
+
+       const (
+               l3CacheSchemaBefore = "L3:0=f;1=f0"
+               l3CacheSchemeAfter  = "L3:0=f0;1=f"
+       )
+
+       helper.writeFileContents(map[string]string{
+               "schemata": l3CacheSchemaBefore + "\n",
+       })
+
+       helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
+       intelrdt := &IntelRdtManager{
+               Config: helper.IntelRdtData.config,
+               Path:   helper.IntelRdtPath,
+       }
+       if err := intelrdt.Set(helper.IntelRdtData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata")
+       if err != nil {
+               t.Fatalf("Failed to parse file 'schemata' - %s", err)
+       }
+       // Only the first schemata line is checked.
+       values := strings.Split(tmpStrings, "\n")
+       value := values[0]
+
+       if value != l3CacheSchemeAfter {
+               t.Fatal("Got the wrong value, set 'schemata' failed.")
+       }
+}
+
+// TestIntelRdtSetMemBwSchema verifies that Set rewrites the MB line of
+// the mock "schemata" file (percentage-mode values).
+// NOTE(review): skips silently via `return` when MBA is disabled.
+func TestIntelRdtSetMemBwSchema(t *testing.T) {
+       if !IsMbaEnabled() {
+               return
+       }
+
+       helper := NewIntelRdtTestUtil(t)
+       defer helper.cleanup()
+
+       const (
+               memBwSchemaBefore = "MB:0=20;1=70"
+               memBwSchemeAfter  = "MB:0=70;1=20"
+       )
+
+       helper.writeFileContents(map[string]string{
+               "schemata": memBwSchemaBefore + "\n",
+       })
+
+       helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwSchemeAfter
+       intelrdt := &IntelRdtManager{
+               Config: helper.IntelRdtData.config,
+               Path:   helper.IntelRdtPath,
+       }
+       if err := intelrdt.Set(helper.IntelRdtData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata")
+       if err != nil {
+               t.Fatalf("Failed to parse file 'schemata' - %s", err)
+       }
+       // Only the first schemata line is checked.
+       values := strings.Split(tmpStrings, "\n")
+       value := values[0]
+
+       if value != memBwSchemeAfter {
+               t.Fatal("Got the wrong value, set 'schemata' failed.")
+       }
+}
+
+// TestIntelRdtSetMemBwScSchema verifies that Set rewrites the MB line
+// of the mock "schemata" file (MBps-mode values).
+// NOTE(review): skips silently via `return` when the MBA Software
+// Controller is disabled.
+func TestIntelRdtSetMemBwScSchema(t *testing.T) {
+       if !IsMbaScEnabled() {
+               return
+       }
+
+       helper := NewIntelRdtTestUtil(t)
+       defer helper.cleanup()
+
+       const (
+               memBwScSchemaBefore = "MB:0=5000;1=7000"
+               memBwScSchemeAfter  = "MB:0=9000;1=4000"
+       )
+
+       helper.writeFileContents(map[string]string{
+               "schemata": memBwScSchemaBefore + "\n",
+       })
+
+       helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwScSchemeAfter
+       intelrdt := &IntelRdtManager{
+               Config: helper.IntelRdtData.config,
+               Path:   helper.IntelRdtPath,
+       }
+       if err := intelrdt.Set(helper.IntelRdtData.config); err != nil {
+               t.Fatal(err)
+       }
+
+       tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata")
+       if err != nil {
+               t.Fatalf("Failed to parse file 'schemata' - %s", err)
+       }
+       // Only the first schemata line is checked.
+       values := strings.Split(tmpStrings, "\n")
+       value := values[0]
+
+       if value != memBwScSchemeAfter {
+               t.Fatal("Got the wrong value, set 'schemata' failed.")
+       }
+}
diff --git a/libcontainer/intelrdt/stats.go b/libcontainer/intelrdt/stats.go
new file mode 100644 (file)
index 0000000..df5686f
--- /dev/null
@@ -0,0 +1,40 @@
+// +build linux
+
+package intelrdt
+
+// L3CacheInfo holds the read-only L3 cache information.
+type L3CacheInfo struct {
+       CbmMask    string `json:"cbm_mask,omitempty"`
+       MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
+       NumClosids uint64 `json:"num_closids,omitempty"`
+}
+
+// MemBwInfo holds the read-only memory bandwidth information read from
+// the info/MB directory.
+type MemBwInfo struct {
+       BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
+       DelayLinear   uint64 `json:"delay_linear,omitempty"`
+       MinBandwidth  uint64 `json:"min_bandwidth,omitempty"`
+       NumClosids    uint64 `json:"num_closids,omitempty"`
+}
+
+// Stats aggregates the Intel RDT information and schemata reported by
+// IntelRdtManager.GetStats.
+type Stats struct {
+       // The read-only L3 cache information
+       L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
+
+       // The read-only L3 cache schema in root
+       L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
+
+       // The L3 cache schema in 'container_id' group
+       L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+
+       // The read-only memory bandwidth information
+       MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
+
+       // The read-only memory bandwidth schema in root
+       MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
+
+       // The memory bandwidth schema in 'container_id' group
+       MemBwSchema string `json:"mem_bw_schema,omitempty"`
+}
+
+// NewStats returns an empty, zero-valued *Stats.
+func NewStats() *Stats {
+       return &Stats{}
+}
diff --git a/libcontainer/intelrdt/util_test.go b/libcontainer/intelrdt/util_test.go
new file mode 100644 (file)
index 0000000..970b6ce
--- /dev/null
@@ -0,0 +1,67 @@
+// +build linux
+
+/*
+ * Utility for testing Intel RDT operations.
+ * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test.
+ */
+package intelrdt
+
+import (
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// intelRdtTestUtil backs a test with a mock Intel RDT "resource
+// control" filesystem rooted in a temporary directory.
+type intelRdtTestUtil struct {
+       // intelRdt data to use in tests
+       IntelRdtData *intelRdtData
+
+       // Path to the mock Intel RDT "resource control" filesystem directory
+       IntelRdtPath string
+
+       // Temporary directory to store mock Intel RDT "resource control" filesystem
+       tempDir string
+       t       *testing.T
+}
+
+// Creates a new test util
+func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
+       d := &intelRdtData{
+               config: &configs.Config{
+                       IntelRdt: &configs.IntelRdt{},
+               },
+       }
+       tempDir, err := ioutil.TempDir("", "intelrdt_test")
+       if err != nil {
+               t.Fatal(err)
+       }
+       d.root = tempDir
+       testIntelRdtPath := filepath.Join(d.root, "resctrl")
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // Ensure the full mock Intel RDT "resource control" filesystem path exists
+       err = os.MkdirAll(testIntelRdtPath, 0755)
+       if err != nil {
+               t.Fatal(err)
+       }
+       return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t}
+}
+
+// cleanup removes the temporary mock filesystem created for the test.
+func (c *intelRdtTestUtil) cleanup() {
+       os.RemoveAll(c.tempDir)
+}
+
+// writeFileContents writes the specified contents into the mock of the
+// specified Intel RDT "resource control" files, failing the test on
+// any write error.
+func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) {
+       for file, contents := range fileContents {
+               err := writeFile(c.IntelRdtPath, file, contents)
+               if err != nil {
+                       c.t.Fatal(err)
+               }
+       }
+}
diff --git a/libcontainer/keys/keyctl.go b/libcontainer/keys/keyctl.go
new file mode 100644 (file)
index 0000000..74dedd5
--- /dev/null
@@ -0,0 +1,48 @@
+// +build linux
+
+package keys
+
+import (
+       "fmt"
+       "strconv"
+       "strings"
+
+       "github.com/pkg/errors"
+
+       "golang.org/x/sys/unix"
+)
+
+// KeySerial is a kernel key or keyring serial number.
+type KeySerial uint32
+
+// JoinSessionKeyring joins the session keyring with the given name via
+// KEYCTL_JOIN_SESSION_KEYRING and returns its serial number.
+func JoinSessionKeyring(name string) (KeySerial, error) {
+       sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
+       if err != nil {
+               return 0, errors.Wrap(err, "create session key")
+       }
+       return KeySerial(sessKeyId), nil
+}
+
+// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
+// anding the bits with the given mask (clearing permissions) and setting
+// additional permission bits
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
+       dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId))
+       if err != nil {
+               return err
+       }
+
+       res := strings.Split(dest, ";")
+       if len(res) < 5 {
+               return fmt.Errorf("Destination buffer for key description is too small")
+       }
+
+       // parse permissions
+       perm64, err := strconv.ParseUint(res[3], 16, 32)
+       if err != nil {
+               return err
+       }
+
+       perm := (uint32(perm64) & mask) | setbits
+
+       return unix.KeyctlSetperm(int(ringId), perm)
+}
diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go
new file mode 100644 (file)
index 0000000..1077e7b
--- /dev/null
@@ -0,0 +1,102 @@
+package logs
+
+import (
+       "bufio"
+       "encoding/json"
+       "fmt"
+       "io"
+       "os"
+       "strconv"
+       "sync"
+
+       "github.com/sirupsen/logrus"
+)
+
+var (
+       // configureMutex serializes invocations of ConfigureLogging.
+       configureMutex = sync.Mutex{}
+       // loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`.
+       // Subsequent invocations of `ConfigureLogging` would be no-op
+       loggingConfigured = false
+)
+
+// Config carries the logging settings consumed by ConfigureLogging.
+type Config struct {
+       LogLevel    logrus.Level
+       LogFormat   string // "text" or "json"
+       LogFilePath string // file to append logs to; used only when LogPipeFd is empty
+       LogPipeFd   string // inherited pipe fd, as a decimal string, to write logs to
+}
+
+// ForwardLogs reads newline-delimited JSON log entries from logPipe
+// until EOF and re-emits each one through the standard logger.
+func ForwardLogs(logPipe io.Reader) {
+       lineReader := bufio.NewReader(logPipe)
+       for {
+               line, err := lineReader.ReadBytes('\n')
+               if len(line) > 0 {
+                       // Forward any partial line read before the error as well.
+                       processEntry(line)
+               }
+               if err == io.EOF {
+                       logrus.Debugf("log pipe has been closed: %+v", err)
+                       return
+               }
+               if err != nil {
+                       // NOTE(review): non-EOF read errors are logged and the loop
+                       // continues; a persistent error would spin here — confirm
+                       // that is intended.
+                       logrus.Errorf("log pipe read error: %+v", err)
+               }
+       }
+}
+
+// processEntry decodes one JSON-encoded log line ({"level":..,"msg":..})
+// and re-emits it through the standard logger at its original level.
+// Malformed lines are reported as errors and dropped.
+func processEntry(text []byte) {
+       type jsonLog struct {
+               Level string `json:"level"`
+               Msg   string `json:"msg"`
+       }
+
+       var jl jsonLog
+       if err := json.Unmarshal(text, &jl); err != nil {
+               logrus.Errorf("failed to decode %q to json: %+v", text, err)
+               return
+       }
+
+       lvl, err := logrus.ParseLevel(jl.Level)
+       if err != nil {
+               logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err)
+               return
+       }
+       // Use Log, not Logf: the message is data, not a format string, so a
+       // literal '%' in it must not be interpreted as a formatting verb.
+       logrus.StandardLogger().Log(lvl, jl.Msg)
+}
+
+// ConfigureLogging initializes the global logrus logger from config:
+// log level, output destination (an inherited pipe fd takes precedence
+// over a file path) and output format ("text" or "json"). It takes
+// effect only once; subsequent calls are no-ops.
+func ConfigureLogging(config Config) error {
+       configureMutex.Lock()
+       defer configureMutex.Unlock()
+
+       if loggingConfigured {
+               logrus.Debug("logging has already been configured")
+               return nil
+       }
+
+       logrus.SetLevel(config.LogLevel)
+
+       if config.LogPipeFd != "" {
+               logPipeFdInt, err := strconv.Atoi(config.LogPipeFd)
+               if err != nil {
+                       return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err)
+               }
+               logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe"))
+       } else if config.LogFilePath != "" {
+               f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644)
+               if err != nil {
+                       return err
+               }
+               logrus.SetOutput(f)
+       }
+
+       switch config.LogFormat {
+       case "text":
+               // retain logrus's default.
+       case "json":
+               logrus.SetFormatter(new(logrus.JSONFormatter))
+       default:
+               return fmt.Errorf("unknown log-format %q", config.LogFormat)
+       }
+
+       loggingConfigured = true
+       return nil
+}
diff --git a/libcontainer/logs/logs_linux_test.go b/libcontainer/logs/logs_linux_test.go
new file mode 100644 (file)
index 0000000..83166fa
--- /dev/null
@@ -0,0 +1,160 @@
+package logs
+
+import (
+       "errors"
+       "io/ioutil"
+       "os"
+       "strings"
+       "testing"
+       "time"
+
+       "github.com/sirupsen/logrus"
+)
+
+// TestLoggingToFile checks that a valid JSON entry written to the log
+// pipe ends up in the configured log file.
+func TestLoggingToFile(t *testing.T) {
+       logW, logFile, _ := runLogForwarding(t)
+       defer os.Remove(logFile)
+       defer logW.Close()
+
+       logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`)
+
+       logFileContent := waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "kitten") {
+               t.Fatalf("%s does not contain kitten", string(logFileContent))
+       }
+}
+
+// TestLogForwardingDoesNotStopOnJsonDecodeErr checks that a malformed
+// JSON line is reported but does not terminate the forwarding loop.
+func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) {
+       logW, logFile, _ := runLogForwarding(t)
+       defer os.Remove(logFile)
+       defer logW.Close()
+
+       logToLogWriter(t, logW, "invalid-json-with-kitten")
+
+       logFileContent := waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "failed to decode") {
+               t.Fatalf("%q does not contain decoding error", string(logFileContent))
+       }
+
+       truncateLogFile(t, logFile)
+
+       // A subsequent valid entry must still be forwarded.
+       logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`)
+
+       logFileContent = waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "puppy") {
+               t.Fatalf("%s does not contain puppy", string(logFileContent))
+       }
+}
+
+// TestLogForwardingDoesNotStopOnLogLevelParsingErr checks that an entry
+// with an unknown level is reported but does not terminate forwarding.
+func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) {
+       logW, logFile, _ := runLogForwarding(t)
+       defer os.Remove(logFile)
+       defer logW.Close()
+
+       logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`)
+
+       logFileContent := waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "failed to parse log level") {
+               t.Fatalf("%q does not contain log level parsing error", string(logFileContent))
+       }
+
+       truncateLogFile(t, logFile)
+
+       // A subsequent valid entry must still be forwarded.
+       logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`)
+
+       logFileContent = waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "puppy") {
+               t.Fatalf("%s does not contain puppy", string(logFileContent))
+       }
+}
+
+// TestLogForwardingStopsAfterClosingTheWriter checks that ForwardLogs
+// returns (closing doneForwarding) once the write end of the pipe is
+// closed.
+func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) {
+       logW, logFile, doneForwarding := runLogForwarding(t)
+       defer os.Remove(logFile)
+
+       logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`)
+
+       logFileContent := waitForLogContent(t, logFile)
+       if !strings.Contains(string(logFileContent), "sync") {
+               t.Fatalf("%q does not contain sync message", string(logFileContent))
+       }
+
+       logW.Close()
+       select {
+       case <-doneForwarding:
+       case <-time.After(10 * time.Second):
+               t.Fatal("log forwarding did not stop after closing the pipe")
+       }
+}
+
+func logToLogWriter(t *testing.T, logW *os.File, message string) {
+       _, err := logW.Write([]byte(message + "\n"))
+       if err != nil {
+               t.Fatalf("failed to write %q to log writer: %v", message, err)
+       }
+}
+
+// runLogForwarding wires up a pipe, a temporary log file and a running
+// ForwardLogs goroutine; it returns the pipe's write end, the log file
+// path and a channel closed when forwarding stops.
+// NOTE(review): the *os.File returned by TempFile is never closed here;
+// only its name is used — confirm relying on process exit is acceptable.
+func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) {
+       logR, logW, err := os.Pipe()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       tempFile, err := ioutil.TempFile("", "")
+       if err != nil {
+               t.Fatal(err)
+       }
+       logFile := tempFile.Name()
+
+       logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile}
+       return logW, logFile, startLogForwarding(t, logConfig, logR)
+}
+
+// startLogForwarding (re)configures logging and starts ForwardLogs in a
+// goroutine, returning a channel that is closed when forwarding ends.
+// It resets loggingConfigured so each test can apply its own config.
+func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} {
+       loggingConfigured = false
+       if err := ConfigureLogging(logConfig); err != nil {
+               t.Fatal(err)
+       }
+       doneForwarding := make(chan struct{})
+       go func() {
+               ForwardLogs(logR)
+               close(doneForwarding)
+       }()
+       return doneForwarding
+}
+
+func waitForLogContent(t *testing.T, logFile string) string {
+       startTime := time.Now()
+
+       for {
+               if time.Now().After(startTime.Add(10 * time.Second)) {
+                       t.Fatal(errors.New("No content in log file after 10 seconds"))
+                       break
+               }
+
+               fileContent, err := ioutil.ReadFile(logFile)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               if len(fileContent) == 0 {
+                       continue
+               }
+               return string(fileContent)
+       }
+
+       return ""
+}
+
+func truncateLogFile(t *testing.T, logFile string) {
+       file, err := os.OpenFile(logFile, os.O_RDWR, 0666)
+       if err != nil {
+               t.Fatalf("failed to open log file: %v", err)
+               return
+       }
+       defer file.Close()
+
+       err = file.Truncate(0)
+       if err != nil {
+               t.Fatalf("failed to truncate log file: %v", err)
+       }
+}
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
new file mode 100644 (file)
index 0000000..1d4f503
--- /dev/null
@@ -0,0 +1,89 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "github.com/vishvananda/netlink/nl"
+       "golang.org/x/sys/unix"
+)
+
+// list of known message types we want to send to bootstrap program
+// The numbers are randomly chosen to not conflict with known netlink
+// attribute/message types.
+const (
+       InitMsg          uint16 = 62000
+       CloneFlagsAttr   uint16 = 27281
+       NsPathsAttr      uint16 = 27282
+       UidmapAttr       uint16 = 27283
+       GidmapAttr       uint16 = 27284
+       SetgroupAttr     uint16 = 27285
+       OomScoreAdjAttr  uint16 = 27286
+       RootlessEUIDAttr uint16 = 27287
+       UidmapPathAttr   uint16 = 27288
+       GidmapPathAttr   uint16 = 27289
+)
+
+// Int32msg is a netlink attribute carrying a single uint32 value.
+type Int32msg struct {
+       Type  uint16
+       Value uint32
+}
+
+// Serialize serializes the message.
+// Int32msg has the following representation
+// | nlattr len | nlattr type |
+// | uint32 value             |
+// All fields are written in the host's native byte order.
+func (msg *Int32msg) Serialize() []byte {
+       buf := make([]byte, msg.Len())
+       native := nl.NativeEndian()
+       native.PutUint16(buf[0:2], uint16(msg.Len()))
+       native.PutUint16(buf[2:4], msg.Type)
+       native.PutUint32(buf[4:8], msg.Value)
+       return buf
+}
+
+// Len returns the total attribute length: the netlink attribute header
+// plus the 4 value bytes.
+func (msg *Int32msg) Len() int {
+       return unix.NLA_HDRLEN + 4
+}
+
+// Bytemsg is a netlink attribute carrying a null-terminated byte string.
+// Bytemsg has the following representation
+// | nlattr len | nlattr type |
+// | value              | pad |
+type Bytemsg struct {
+       Type  uint16
+       Value []byte
+}
+
+// Serialize writes the attribute header and value in native byte order,
+// padding the buffer up to the netlink NLA_ALIGNTO boundary. The length
+// field records the unpadded length.
+func (msg *Bytemsg) Serialize() []byte {
+       l := msg.Len()
+       buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
+       native := nl.NativeEndian()
+       native.PutUint16(buf[0:2], uint16(l))
+       native.PutUint16(buf[2:4], msg.Type)
+       copy(buf[4:], msg.Value)
+       return buf
+}
+
+// Len returns the unpadded attribute length, including one trailing
+// zero byte so the value is null-terminated.
+func (msg *Bytemsg) Len() int {
+       return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
+}
+
+// Boolmsg is a netlink attribute carrying a boolean encoded as a uint32
+// (1 for true, 0 for false).
+type Boolmsg struct {
+       Type  uint16
+       Value bool
+}
+
+// Serialize writes the attribute header and the 0/1 value in the host's
+// native byte order.
+func (msg *Boolmsg) Serialize() []byte {
+       buf := make([]byte, msg.Len())
+       native := nl.NativeEndian()
+       native.PutUint16(buf[0:2], uint16(msg.Len()))
+       native.PutUint16(buf[2:4], msg.Type)
+       if msg.Value {
+               native.PutUint32(buf[4:8], uint32(1))
+       } else {
+               native.PutUint32(buf[4:8], uint32(0))
+       }
+       return buf
+}
+
+// Len returns the attribute length: header plus a 4-byte value
+// (uint32 rather than a single byte, for alignment).
+func (msg *Boolmsg) Len() int {
+       return unix.NLA_HDRLEN + 4 // alignment
+}
diff --git a/libcontainer/mount/mount.go b/libcontainer/mount/mount.go
new file mode 100644 (file)
index 0000000..e8965e0
--- /dev/null
@@ -0,0 +1,23 @@
+package mount
+
+// GetMounts retrieves a list of mounts for the current running process
+// by delegating to the platform-specific mount-table parser.
+func GetMounts() ([]*Info, error) {
+       return parseMountTable()
+}
+
+// Mounted looks at /proc/self/mountinfo to determine of the specified
+// mountpoint has been mounted
+func Mounted(mountpoint string) (bool, error) {
+       entries, err := parseMountTable()
+       if err != nil {
+               return false, err
+       }
+
+       // Search the table for the mountpoint
+       for _, e := range entries {
+               if e.Mountpoint == mountpoint {
+                       return true, nil
+               }
+       }
+       return false, nil
+}
diff --git a/libcontainer/mount/mount_linux.go b/libcontainer/mount/mount_linux.go
new file mode 100644 (file)
index 0000000..1e51919
--- /dev/null
@@ -0,0 +1,82 @@
+// +build linux
+
+package mount
+
+import (
+       "bufio"
+       "fmt"
+       "io"
+       "os"
+       "strings"
+)
+
+const (
+       /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+          (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+          (1) mount ID:  unique identifier of the mount (may be reused after umount)
+          (2) parent ID:  ID of parent (or of self for the top of the mount tree)
+          (3) major:minor:  value of st_dev for files on filesystem
+          (4) root:  root of the mount within the filesystem
+          (5) mount point:  mount point relative to the process's root
+          (6) mount options:  per mount options
+          (7) optional fields:  zero or more fields of the form "tag[:value]"
+          (8) separator:  marks the end of the optional fields
+          (9) filesystem type:  name of filesystem of the form "type[.subtype]"
+          (10) mount source:  filesystem specific information or "none"
+          (11) super options:  per super block options*/
+       // mountinfoFormat is the Sscanf format for fields (1)-(7) above; the
+       // remaining fields are split manually after the " - " separator.
+       mountinfoFormat = "%d %d %d:%d %s %s %s %s"
+)
+
+// Parse /proc/self/mountinfo because comparing Dev and ino does not work from
+// bind mounts.
+func parseMountTable() ([]*Info, error) {
+       f, err := os.Open("/proc/self/mountinfo")
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       return parseInfoFile(f)
+}
+
+// parseInfoFile parses mountinfo-format data from r into Info records; see
+// the mountinfoFormat comment for the field layout.
+func parseInfoFile(r io.Reader) ([]*Info, error) {
+       var (
+               s   = bufio.NewScanner(r)
+               out = []*Info{}
+       )
+
+       for s.Scan() {
+               var (
+                       p              = &Info{}
+                       text           = s.Text()
+                       optionalFields string
+               )
+
+               if _, err := fmt.Sscanf(text, mountinfoFormat,
+                       &p.ID, &p.Parent, &p.Major, &p.Minor,
+                       &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil {
+                       return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err)
+               }
+               // Safe as mountinfo encodes mountpoints with spaces as \040.
+               index := strings.Index(text, " - ")
+               if index < 0 {
+                       // A missing separator previously produced index==-1 and
+                       // silently mis-parsed text[2:]; report it as malformed.
+                       return nil, fmt.Errorf("Error missing ' - ' separator in %q", text)
+               }
+               postSeparatorFields := strings.Fields(text[index+3:])
+               if len(postSeparatorFields) < 3 {
+                       return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+               }
+
+               // A literal "-" means no optional fields were present.
+               if optionalFields != "-" {
+                       p.Optional = optionalFields
+               }
+
+               p.Fstype = postSeparatorFields[0]
+               p.Source = postSeparatorFields[1]
+               p.VfsOpts = strings.Join(postSeparatorFields[2:], " ")
+               out = append(out, p)
+       }
+       // Check for a read error once scanning stops; inside the loop Err is
+       // always nil while Scan keeps returning true.
+       if err := s.Err(); err != nil {
+               return nil, err
+       }
+       return out, nil
+}
diff --git a/libcontainer/mount/mountinfo.go b/libcontainer/mount/mountinfo.go
new file mode 100644 (file)
index 0000000..e3fc353
--- /dev/null
@@ -0,0 +1,40 @@
+package mount
+
+// Info reveals information about a particular mounted filesystem. This
+// struct is populated from the content in the /proc/<pid>/mountinfo file.
+type Info struct {
+       // ID is a unique identifier of the mount (may be reused after umount).
+       ID int
+
+       // Parent indicates the ID of the mount parent (or of self for the top of the
+       // mount tree).
+       Parent int
+
+       // Major indicates one half of the device ID which identifies the device class.
+       Major int
+
+       // Minor indicates one half of the device ID which identifies a specific
+       // instance of device.
+       Minor int
+
+       // Root of the mount within the filesystem.
+       Root string
+
+       // Mountpoint indicates the mount point relative to the process's root.
+       Mountpoint string
+
+       // Opts represents mount-specific options.
+       Opts string
+
+       // Optional represents optional fields ("tag[:value]", space separated);
+       // empty when the mountinfo line had the bare "-" placeholder.
+       Optional string
+
+       // Fstype indicates the type of filesystem, such as EXT3.
+       Fstype string
+
+       // Source indicates filesystem specific information or "none".
+       Source string
+
+       // VfsOpts represents per super block options.
+       VfsOpts string
+}
diff --git a/libcontainer/network_linux.go b/libcontainer/network_linux.go
new file mode 100644 (file)
index 0000000..938d8ce
--- /dev/null
@@ -0,0 +1,103 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "io/ioutil"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/types"
+       "github.com/vishvananda/netlink"
+)
+
+// strategies maps a strategy name to its implementation; only "loopback"
+// is registered in this file.
+var strategies = map[string]networkStrategy{
+       "loopback": &loopback{},
+}
+
+// networkStrategy represents a specific network configuration for
+// a container's networking stack.
+type networkStrategy interface {
+       create(*network, int) error
+       initialize(*network) error
+       detach(*configs.Network) error
+       attach(*configs.Network) error
+}
+
+// getStrategy returns the specific network strategy for the
+// provided type.
+func getStrategy(tpe string) (networkStrategy, error) {
+       s, exists := strategies[tpe]
+       if !exists {
+               return nil, fmt.Errorf("unknown strategy type %q", tpe)
+       }
+       return s, nil
+}
+
+// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
+// An empty interfaceName yields a zeroed result rather than an error.
+func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) {
+       out := &types.NetworkInterface{Name: interfaceName}
+       // This can happen if the network runtime information is missing - possible if the
+       // container was created by an old version of libcontainer.
+       if interfaceName == "" {
+               return out, nil
+       }
+       type netStatsPair struct {
+               // Where to write the output.
+               Out *uint64
+               // The network stats file to read.
+               File string
+       }
+       // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
+       // The rx/tx swap below is therefore intentional, not a typo.
+       netStats := []netStatsPair{
+               {Out: &out.RxBytes, File: "tx_bytes"},
+               {Out: &out.RxPackets, File: "tx_packets"},
+               {Out: &out.RxErrors, File: "tx_errors"},
+               {Out: &out.RxDropped, File: "tx_dropped"},
+
+               {Out: &out.TxBytes, File: "rx_bytes"},
+               {Out: &out.TxPackets, File: "rx_packets"},
+               {Out: &out.TxErrors, File: "rx_errors"},
+               {Out: &out.TxDropped, File: "rx_dropped"},
+       }
+       for _, netStat := range netStats {
+               data, err := readSysfsNetworkStats(interfaceName, netStat.File)
+               if err != nil {
+                       // Any unreadable counter aborts the whole collection.
+                       return nil, err
+               }
+               *(netStat.Out) = data
+       }
+       return out, nil
+}
+
+// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
+func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
+       data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
+       if err != nil {
+               return 0, err
+       }
+       return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
+}
+
+// loopback is a network strategy that provides a basic loopback device.
+type loopback struct {
+}
+
+// create is a no-op for loopback.
+func (l *loopback) create(n *network, nspid int) error {
+       return nil
+}
+
+// initialize brings the "lo" interface up via netlink.
+func (l *loopback) initialize(config *network) error {
+       return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
+}
+
+// attach is a no-op for loopback.
+func (l *loopback) attach(n *configs.Network) (err error) {
+       return nil
+}
+
+// detach is a no-op for loopback.
+func (l *loopback) detach(n *configs.Network) (err error) {
+       return nil
+}
diff --git a/libcontainer/notify_linux.go b/libcontainer/notify_linux.go
new file mode 100644 (file)
index 0000000..47a0678
--- /dev/null
@@ -0,0 +1,90 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+
+       "golang.org/x/sys/unix"
+)
+
+// oomCgroupName is the cgroup subsystem whose path is used for memory
+// event registration.
+const oomCgroupName = "memory"
+
+// PressureLevel selects a memory.pressure_level threshold (cgroup v1).
+type PressureLevel uint
+
+const (
+       LowPressure PressureLevel = iota
+       MediumPressure
+       CriticalPressure
+)
+
+// registerMemoryEvent registers an eventfd with the cgroup's
+// cgroup.event_control file for the given event file (evName) and optional
+// argument, and returns a channel that receives one value per event. The
+// channel is closed — and both file descriptors released — when the eventfd
+// read fails or the event_control file disappears (cgroup destroyed).
+func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
+       evFile, err := os.Open(filepath.Join(cgDir, evName))
+       if err != nil {
+               return nil, err
+       }
+       fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
+       if err != nil {
+               evFile.Close()
+               return nil, err
+       }
+
+       eventfd := os.NewFile(uintptr(fd), "eventfd")
+
+       // Registration format expected by the kernel: "<eventfd> <event file fd> [args]".
+       eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
+       data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
+       if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
+               eventfd.Close()
+               evFile.Close()
+               return nil, err
+       }
+       ch := make(chan struct{})
+       go func() {
+               // Cleanup runs exactly once, whichever way the loop exits.
+               defer func() {
+                       eventfd.Close()
+                       evFile.Close()
+                       close(ch)
+               }()
+               buf := make([]byte, 8) // eventfd counters are 8 bytes
+               for {
+                       if _, err := eventfd.Read(buf); err != nil {
+                               return
+                       }
+                       // When a cgroup is destroyed, an event is sent to eventfd.
+                       // So if the control path is gone, return instead of notifying.
+                       if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
+                               return
+                       }
+                       // Unbuffered send: blocks until the consumer receives.
+                       ch <- struct{}{}
+               }
+       }()
+       return ch, nil
+}
+
+// notifyOnOOM returns channel on which you can expect event about OOM,
+// if process died without OOM this channel will be closed.
+func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
+       dir := paths[oomCgroupName]
+       if dir == "" {
+               return nil, fmt.Errorf("path %q missing", oomCgroupName)
+       }
+
+       return registerMemoryEvent(dir, "memory.oom_control", "")
+}
+
+// notifyMemoryPressure returns a channel delivering one value per memory
+// pressure event at the given level; it is closed when the cgroup goes away.
+func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
+       dir := paths[oomCgroupName]
+       if dir == "" {
+               return nil, fmt.Errorf("path %q missing", oomCgroupName)
+       }
+
+       var levelStr string
+       switch level {
+       case LowPressure:
+               levelStr = "low"
+       case MediumPressure:
+               levelStr = "medium"
+       case CriticalPressure:
+               levelStr = "critical"
+       default:
+               return nil, fmt.Errorf("invalid pressure level %d", level)
+       }
+       return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
+}
diff --git a/libcontainer/notify_linux_test.go b/libcontainer/notify_linux_test.go
new file mode 100644 (file)
index 0000000..1e15ae2
--- /dev/null
@@ -0,0 +1,126 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "encoding/binary"
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "testing"
+       "time"
+
+       "golang.org/x/sys/unix"
+)
+
+// notifyFunc is the common signature shared by notifyOnOOM and the
+// notifyMemoryPressure closures exercised by the harness below.
+type notifyFunc func(paths map[string]string) (<-chan struct{}, error)
+
+// testMemoryNotification is a shared harness: it fakes a memory cgroup
+// directory, registers through notify, verifies what was written to
+// cgroup.event_control (targ is the expected optional argument, "" for
+// none), fires the eventfd, and finally checks that removing the cgroup
+// directory closes the channel and both file descriptors.
+func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ string) {
+       memoryPath, err := ioutil.TempDir("", "testmemnotification-"+evName)
+       if err != nil {
+               t.Fatal(err)
+       }
+       evFile := filepath.Join(memoryPath, evName)
+       eventPath := filepath.Join(memoryPath, "cgroup.event_control")
+       if err := ioutil.WriteFile(evFile, []byte{}, 0700); err != nil {
+               t.Fatal(err)
+       }
+       if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil {
+               t.Fatal(err)
+       }
+       paths := map[string]string{
+               "memory": memoryPath,
+       }
+       ch, err := notify(paths)
+       if err != nil {
+               t.Fatal("expected no error, got:", err)
+       }
+
+       data, err := ioutil.ReadFile(eventPath)
+       if err != nil {
+               t.Fatal("couldn't read event control file:", err)
+       }
+
+       // Parse back the "<eventfd> <event file fd> [arg]" registration line.
+       var eventFd, evFd int
+       var arg string
+       if targ != "" {
+               _, err = fmt.Sscanf(string(data), "%d %d %s", &eventFd, &evFd, &arg)
+       } else {
+               _, err = fmt.Sscanf(string(data), "%d %d", &eventFd, &evFd)
+       }
+       if err != nil || arg != targ {
+               t.Fatalf("invalid control data %q: %s", data, err)
+       }
+
+       // dup the eventfd
+       efd, err := unix.Dup(eventFd)
+       if err != nil {
+               t.Fatal("unable to dup eventfd:", err)
+       }
+       defer unix.Close(efd)
+
+       buf := make([]byte, 8)
+       binary.LittleEndian.PutUint64(buf, 1)
+
+       if _, err := unix.Write(efd, buf); err != nil {
+               t.Fatal("unable to write to eventfd:", err)
+       }
+
+       select {
+       case <-ch:
+       case <-time.After(100 * time.Millisecond):
+               t.Fatal("no notification on channel after 100ms")
+       }
+
+       // simulate what happens when a cgroup is destroyed by cleaning up and then
+       // writing to the eventfd.
+       if err := os.RemoveAll(memoryPath); err != nil {
+               t.Fatal(err)
+       }
+       if _, err := unix.Write(efd, buf); err != nil {
+               t.Fatal("unable to write to eventfd:", err)
+       }
+
+       // give things a moment to shut down
+       select {
+       case _, ok := <-ch:
+               if ok {
+                       t.Fatal("expected no notification to be triggered")
+               }
+       case <-time.After(100 * time.Millisecond):
+               t.Fatal("channel not closed after 100ms")
+       }
+
+       // Both fds should have been closed by the notifier's cleanup path;
+       // F_GETFD on a closed fd must fail with EBADF.
+       if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(evFd), unix.F_GETFD, 0); err != unix.EBADF {
+               t.Errorf("expected event control to be closed, but received error %s", err.Error())
+       }
+
+       if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(eventFd), unix.F_GETFD, 0); err != unix.EBADF {
+               t.Errorf("expected event fd to be closed, but received error %s", err.Error())
+       }
+}
+
+func TestNotifyOnOOM(t *testing.T) {
+       f := func(paths map[string]string) (<-chan struct{}, error) {
+               return notifyOnOOM(paths)
+       }
+
+       testMemoryNotification(t, "memory.oom_control", f, "")
+}
+
+func TestNotifyMemoryPressure(t *testing.T) {
+       tests := map[PressureLevel]string{
+               LowPressure:      "low",
+               MediumPressure:   "medium",
+               CriticalPressure: "critical",
+       }
+
+       for level, arg := range tests {
+               f := func(paths map[string]string) (<-chan struct{}, error) {
+                       return notifyMemoryPressure(paths, level)
+               }
+
+               testMemoryNotification(t, "memory.pressure_level", f, arg)
+       }
+}
diff --git a/libcontainer/nsenter/README.md b/libcontainer/nsenter/README.md
new file mode 100644 (file)
index 0000000..9ec6c39
--- /dev/null
@@ -0,0 +1,44 @@
+## nsenter
+
+The `nsenter` package registers a special init constructor that is called before 
+the Go runtime has a chance to boot.  This provides us the ability to `setns` on 
+existing namespaces and avoid the issues that the Go runtime has with multiple 
+threads.  This constructor will be called whenever this package is
+registered (imported) in your Go application.
+
+The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/)
+package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, 
+called the preamble, is used as a header when compiling the C parts of the package.
+So every time we  import package `nsenter`, the C code function `nsexec()` would be 
+called. And package `nsenter` is only imported in `init.go`, so every time the runc
+`init` command is invoked, that C code is run.
+
+Because `nsexec()` must be run before the Go runtime in order to use the
+Linux kernel namespace, you must `import` this library into a package if
+you plan to use `libcontainer` directly. Otherwise Go will not execute
+the `nsexec()` constructor, which means that the re-exec will not cause
+the namespaces to be joined. You can import it like this:
+
+```go
+import _ "github.com/opencontainers/runc/libcontainer/nsenter"
+```
+
+`nsexec()` will first get the file descriptor number for the init pipe
+from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened
+by the parent and kept open across the fork-exec of the `nsexec()` init
+process). The init pipe is used to read bootstrap data (namespace paths,
+clone flags, uid and gid mappings, and the console path) from the parent
+process. `nsexec()` will then call `setns(2)` to join the namespaces
+provided in the bootstrap data (if available), `clone(2)` a child process
+with the provided clone flags, update the user and group ID mappings, do
+some further miscellaneous setup steps, and then send the PID of the
+child process to the parent of the `nsexec()` "caller". Finally,
+the parent `nsexec()` will exit and the child `nsexec()` process will
+return to allow the Go runtime to take over.
+
+NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
+`CLONE_NEW*` clone flags because we must fork a new process in order to
+enter the PID namespace.
+
+
+
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
new file mode 100644 (file)
index 0000000..ad10f14
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2019 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/sendfile.h>
+#include <sys/syscall.h>
+
+/* Use our own wrapper for memfd_create. */
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
+#  define SYS_memfd_create __NR_memfd_create
+#endif
+/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
+#ifndef MFD_CLOEXEC
+#  define MFD_CLOEXEC       0x0001U
+#  define MFD_ALLOW_SEALING 0x0002U
+#endif
+/* memfd_create(2) via raw syscall; reports ENOSYS when the syscall number
+ * is unknown at build time so callers can fall back to other methods. */
+int memfd_create(const char *name, unsigned int flags)
+{
+#ifdef SYS_memfd_create
+       return syscall(SYS_memfd_create, name, flags);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+
+/* This comes directly from <linux/fcntl.h>. */
+#ifndef F_LINUX_SPECIFIC_BASE
+#  define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef F_ADD_SEALS
+#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+#endif
+#ifndef F_SEAL_SEAL
+#  define F_SEAL_SEAL   0x0001 /* prevent further seals from being set */
+#  define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#  define F_SEAL_GROW   0x0004 /* prevent file from growing */
+#  define F_SEAL_WRITE  0x0008 /* prevent writes */
+#endif
+
+#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
+#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
+/* The full seal set applied to the cloned memfd; is_self_cloned() checks
+ * for exactly this set. */
+#define RUNC_MEMFD_SEALS \
+       (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
+
+/* must_realloc retries realloc until it succeeds, so it never returns NULL.
+ * NOTE(review): this spins forever under persistent OOM -- presumably
+ * acceptable since this code cannot make progress without the memory. */
+static void *must_realloc(void *ptr, size_t size)
+{
+       void *old = ptr;
+       do {
+               ptr = realloc(old, size);
+       } while(!ptr);
+       return ptr;
+}
+
+/*
+ * Verify whether we are currently in a self-cloned program (namely, is
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
+ * for shmem files), and we want to be sure it's actually sealed.
+ *
+ * Returns non-zero if we look self-cloned, 0 if not, and -ENOTRECOVERABLE
+ * if /proc/self/exe cannot be opened at all.
+ */
+static int is_self_cloned(void)
+{
+       int fd, ret, is_cloned = 0;
+       struct stat statbuf = {};
+       struct statfs fsbuf = {};
+
+       fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+       if (fd < 0)
+               return -ENOTRECOVERABLE;
+
+       /*
+        * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
+        * this, because you cannot write to a sealed memfd no matter what (so
+        * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
+        * memfd to /usr/bin/runc to allow re-use).
+        */
+       ret = fcntl(fd, F_GET_SEALS);
+       if (ret >= 0) {
+               is_cloned = (ret == RUNC_MEMFD_SEALS);
+               goto out;
+       }
+
+       /*
+        * All other forms require CLONED_BINARY_ENV, since they are potentially
+        * writeable (or we can't tell if they're fully safe) and thus we must
+        * check the environment as an extra layer of defence.
+        */
+       if (!getenv(CLONED_BINARY_ENV)) {
+               is_cloned = false;
+               goto out;
+       }
+
+       /*
+        * Is the binary on a read-only filesystem? We can't detect bind-mounts in
+        * particular (in-kernel they are identical to regular mounts) but we can
+        * at least be sure that it's read-only. In addition, to make sure that
+        * it's *our* bind-mount we check CLONED_BINARY_ENV.
+        */
+       if (fstatfs(fd, &fsbuf) >= 0)
+               is_cloned |= (fsbuf.f_flags & MS_RDONLY);
+
+       /*
+        * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
+        * which appears to have a borked backport of F_GET_SEALS. Either way,
+        * having a file which has no hardlinks indicates that we aren't using
+        * a host-side "runc" binary and this is something that a container
+        * cannot fake (because unlinking requires being able to resolve the
+        * path that you want to unlink).
+        */
+       if (fstat(fd, &statbuf) >= 0)
+               is_cloned |= (statbuf.st_nlink == 0);
+
+out:
+       close(fd);
+       return is_cloned;
+}
+
+/* Read a given file into a new buffer, providing the length via *length.
+ * The buffer is heap-allocated and owned by the caller; returns NULL on
+ * any error (length pointer missing, open or read failure). */
+static char *read_file(char *path, size_t *length)
+{
+       int fd;
+       char buf[4096], *copy = NULL;
+
+       if (!length)
+               return NULL;
+
+       fd = open(path, O_RDONLY | O_CLOEXEC);
+       if (fd < 0)
+               return NULL;
+
+       *length = 0;
+       for (;;) {
+               ssize_t n;
+
+               n = read(fd, buf, sizeof(buf));
+               if (n < 0)
+                       goto error;
+               if (!n)
+                       break;
+
+               /* Grow the copy by exactly the bytes just read. */
+               copy = must_realloc(copy, (*length + n) * sizeof(*copy));
+               memcpy(copy + *length, buf, n);
+               *length += n;
+       }
+       close(fd);
+       return copy;
+
+error:
+       close(fd);
+       free(copy);
+       return NULL;
+}
+
+/*
+ * A poor-man's version of "xargs -0". Basically parses a given block of
+ * NUL-delimited data, within the given length and adds a pointer to each entry
+ * to the array of pointers. Returns the number of entries (the array is
+ * NULL-terminated), or -1 if data is NULL or *output is already populated.
+ */
+static int parse_xargs(char *data, int data_length, char ***output)
+{
+       int num = 0;
+       char *cur = data;
+
+       if (!data || *output != NULL)
+               return -1;
+
+       while (cur < data + data_length) {
+               num++;
+               /* num entries plus room for the trailing NULL sentinel. */
+               *output = must_realloc(*output, (num + 1) * sizeof(**output));
+               (*output)[num - 1] = cur;
+               cur += strlen(cur) + 1;
+       }
+       (*output)[num] = NULL;
+       return num;
+}
+
+/*
+ * "Parse" out argv from /proc/self/cmdline.
+ * This is necessary because we are running in a context where we don't have a
+ * main() that we can just get the arguments from.
+ *
+ * On success the cmdline buffer is deliberately NOT freed: the *argv
+ * pointers produced by parse_xargs() point into it and are later passed
+ * to fexecve(3).
+ */
+static int fetchve(char ***argv)
+{
+       char *cmdline = NULL;
+       size_t cmdline_size;
+
+       cmdline = read_file("/proc/self/cmdline", &cmdline_size);
+       if (!cmdline)
+               goto error;
+
+       if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
+               goto error;
+
+       return 0;
+
+error:
+       free(cmdline);
+       return -EINVAL;
+}
+
+/* How make_execfd() created the fd; seal_execfd() seals accordingly. */
+enum {
+       EFD_NONE = 0,
+       EFD_MEMFD,
+       EFD_FILE,
+};
+
+/*
+ * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
+ * changes depending on the architecture. If we don't have O_TMPFILE we always
+ * have the mkostemp(3) fallback.
+ */
+#ifndef O_TMPFILE
+#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
+#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
+#  endif
+#endif
+
+/*
+ * make_execfd creates an fd that will hold the copied binary, trying in
+ * order: memfd_create(2), O_TMPFILE in STATEDIR (or /tmp), and finally an
+ * immediately-unlinked mkostemp(3) file. *fdtype records which variant
+ * succeeded (EFD_NONE on failure).
+ */
+static int make_execfd(int *fdtype)
+{
+       int fd = -1;
+       char template[PATH_MAX] = {0};
+       char *prefix = getenv("_LIBCONTAINER_STATEDIR");
+
+       /* Only trust an absolute STATEDIR path; otherwise fall back to /tmp. */
+       if (!prefix || *prefix != '/')
+               prefix = "/tmp";
+       if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
+               return -1;
+
+       /*
+        * Now try memfd, it's much nicer than actually creating a file in STATEDIR
+        * since it's easily detected thanks to sealing and also doesn't require
+        * assumptions about STATEDIR.
+        */
+       *fdtype = EFD_MEMFD;
+       fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
+       if (fd >= 0)
+               return fd;
+       if (errno != ENOSYS && errno != EINVAL)
+               goto error;
+
+#ifdef O_TMPFILE
+       /*
+        * Try O_TMPFILE to avoid races where someone might snatch our file. Note
+        * that O_EXCL isn't actually a security measure here (since you can just
+        * fd re-open it and clear O_EXCL).
+        */
+       *fdtype = EFD_FILE;
+       fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
+       if (fd >= 0) {
+               struct stat statbuf = {};
+               bool working_otmpfile = false;
+
+               /*
+                * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
+                * found this out too. As a result we can't check for EINVAL. However,
+                * if we get nlink != 0 (or EISDIR) then we know that this kernel
+                * doesn't support O_TMPFILE.
+                */
+               if (fstat(fd, &statbuf) >= 0)
+                       working_otmpfile = (statbuf.st_nlink == 0);
+
+               if (working_otmpfile)
+                       return fd;
+
+               /* Pretend that we got EISDIR since O_TMPFILE failed. */
+               close(fd);
+               errno = EISDIR;
+       }
+       if (errno != EISDIR)
+               goto error;
+#endif /* defined(O_TMPFILE) */
+
+       /*
+        * Our final option is to create a temporary file the old-school way, and
+        * then unlink it so that nothing else sees it by accident.
+        */
+       *fdtype = EFD_FILE;
+       fd = mkostemp(template, O_CLOEXEC);
+       if (fd >= 0) {
+               if (unlink(template) >= 0)
+                       return fd;
+               close(fd);
+       }
+
+error:
+       *fdtype = EFD_NONE;
+       return -1;
+}
+
+/*
+ * seal_execfd makes the cloned binary fd immutable: memfds are sealed with
+ * F_ADD_SEALS; plain files are chmodded to 0100 and re-opened O_PATH
+ * (replacing *fd). Returns 0 on success, -1 on error or unknown fdtype.
+ */
+static int seal_execfd(int *fd, int fdtype)
+{
+       switch (fdtype) {
+       case EFD_MEMFD:
+               return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
+       case EFD_FILE: {
+               /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+               int newfd;
+               char fdpath[PATH_MAX] = {0};
+
+               if (fchmod(*fd, 0100) < 0)
+                       return -1;
+
+               if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+                       return -1;
+
+               newfd = open(fdpath, O_PATH | O_CLOEXEC);
+               if (newfd < 0)
+                       return -1;
+
+               close(*fd);
+               *fd = newfd;
+               return 0;
+       }
+       default:
+          break;
+       }
+       return -1;
+}
+
+/*
+ * try_bindfd returns an O_PATH fd referring to a read-only bind-mount of
+ * /proc/self/exe, or a negative errno-style value when that cannot be done
+ * safely (e.g. no mount permission, or the MNT_DETACH cleanup failed).
+ */
+static int try_bindfd(void)
+{
+       int fd, ret = -1;
+       char template[PATH_MAX] = {0};
+       char *prefix = getenv("_LIBCONTAINER_STATEDIR");
+
+       if (!prefix || *prefix != '/')
+               prefix = "/tmp";
+       if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
+               return ret;
+
+       /*
+        * We need somewhere to mount it, mounting anything over /proc/self is a
+        * BAD idea on the host -- even if we do it temporarily.
+        */
+       fd = mkstemp(template);
+       if (fd < 0)
+               return ret;
+       close(fd);
+
+       /*
+        * For obvious reasons this won't work in rootless mode because we haven't
+        * created a userns+mntns -- but getting that to work will be a bit
+        * complicated and it's only worth doing if someone actually needs it.
+        */
+       ret = -EPERM;
+       if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
+               goto out;
+       if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
+               goto out_umount;
+
+
+       /* Get read-only handle that we're sure can't be made read-write. */
+       ret = open(template, O_PATH | O_CLOEXEC);
+
+out_umount:
+       /*
+        * Make sure the MNT_DETACH works, otherwise we could get remounted
+        * read-write and that would be quite bad (the fd would be made read-write
+        * too, invalidating the protection).
+        */
+       if (umount2(template, MNT_DETACH) < 0) {
+               if (ret >= 0)
+                       close(ret);
+               ret = -ENOTRECOVERABLE;
+       }
+
+out:
+       /*
+        * We don't care about unlink errors, the worst that happens is that
+        * there's an empty file left around in STATEDIR.
+        */
+       unlink(template);
+       return ret;
+}
+
+/*
+ * fd_to_fd copies everything readable from infd to outfd, handling short
+ * writes. Returns the total number of bytes copied, or -1 on any error.
+ */
+static ssize_t fd_to_fd(int outfd, int infd)
+{
+       ssize_t total = 0;
+       char buffer[4096];
+
+       for (;;) {
+               ssize_t nread = read(infd, buffer, sizeof(buffer));
+               if (nread < 0)
+                       return -1;
+               if (nread == 0)
+                       break;
+
+               /* Keep writing until this chunk is fully flushed. */
+               ssize_t nwritten = 0;
+               while (nwritten < nread) {
+                       ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
+                       if (n < 0)
+                               return -1;
+                       nwritten += n;
+               }
+               total += nwritten;
+       }
+
+       return total;
+}
+
+/*
+ * clone_binary returns an fd holding a protected copy of /proc/self/exe:
+ * first it tries a read-only bind-mount handle (try_bindfd), otherwise it
+ * copies the binary into a sealed fd (make_execfd + seal_execfd). Returns
+ * a negative errno-style value on failure.
+ */
+static int clone_binary(void)
+{
+       int binfd, execfd;
+       struct stat statbuf = {};
+       size_t sent = 0;
+       int fdtype = EFD_NONE;
+
+       /*
+        * Before we resort to copying, let's try creating an ro-binfd in one shot
+        * by getting a handle for a read-only bind-mount of the execfd.
+        */
+       execfd = try_bindfd();
+       if (execfd >= 0)
+               return execfd;
+
+       /*
+        * Dammit, that didn't work -- time to copy the binary to a safe place we
+        * can seal the contents.
+        */
+       execfd = make_execfd(&fdtype);
+       if (execfd < 0 || fdtype == EFD_NONE)
+               return -ENOTRECOVERABLE;
+
+       binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
+       if (binfd < 0)
+               goto error;
+
+       if (fstat(binfd, &statbuf) < 0)
+               goto error_binfd;
+
+       /* NOTE(review): sent (size_t) vs st_size (off_t) mixes signedness --
+        * benign for regular files, whose size is non-negative. */
+       while (sent < statbuf.st_size) {
+               int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
+               if (n < 0) {
+                       /* sendfile can fail so we fallback to a dumb user-space copy. */
+                       n = fd_to_fd(execfd, binfd);
+                       if (n < 0)
+                               goto error_binfd;
+               }
+               sent += n;
+       }
+       close(binfd);
+       if (sent != statbuf.st_size)
+               goto error;
+
+       if (seal_execfd(&execfd, fdtype) < 0)
+               goto error;
+
+       return execfd;
+
+error_binfd:
+       close(binfd);
+error:
+       close(execfd);
+       return -EIO;
+}
+
+/* Get cheap access to the environment. */
+extern char **environ;
+
+/*
+ * ensure_cloned_binary re-executes the current process from a protected
+ * copy of /proc/self/exe via fexecve(3); on success it does not return.
+ * Returns a positive value (or -ENOTRECOVERABLE) if we are already running
+ * from a cloned binary, and a negative value on failure.
+ */
+int ensure_cloned_binary(void)
+{
+       int execfd;
+       char **argv = NULL;
+
+       /* Check that we're not self-cloned, and if we are then bail. */
+       int cloned = is_self_cloned();
+       if (cloned > 0 || cloned == -ENOTRECOVERABLE)
+               return cloned;
+
+       if (fetchve(&argv) < 0)
+               return -EINVAL;
+
+       execfd = clone_binary();
+       if (execfd < 0)
+               return -EIO;
+
+       /* Mark the re-executed child so its is_self_cloned() accepts non-memfds. */
+       if (putenv(CLONED_BINARY_ENV "=1"))
+               goto error;
+
+       /* Replaces the process image on success; only errors fall through. */
+       fexecve(execfd, argv, environ);
+error:
+       close(execfd);
+       return -ENOEXEC;
+}
diff --git a/libcontainer/nsenter/namespace.h b/libcontainer/nsenter/namespace.h
new file mode 100644 (file)
index 0000000..9e9bdca
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef NSENTER_NAMESPACE_H
+#define NSENTER_NAMESPACE_H
+
+#ifndef _GNU_SOURCE
+#	define _GNU_SOURCE
+#endif
+#include <sched.h>
+
+/*
+ * All of these are taken from include/uapi/linux/sched.h. They are
+ * defined here only as a fallback for libcs whose <sched.h> predates
+ * the corresponding namespace flag.
+ */
+#ifndef CLONE_NEWNS
+#	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
+#endif
+#ifndef CLONE_NEWCGROUP
+#	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
+#endif
+#ifndef CLONE_NEWUTS
+#	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
+#endif
+#ifndef CLONE_NEWIPC
+#	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
+#endif
+#ifndef CLONE_NEWUSER
+#	define CLONE_NEWUSER 0x10000000 /* New user namespace */
+#endif
+#ifndef CLONE_NEWPID
+#	define CLONE_NEWPID 0x20000000 /* New pid namespace */
+#endif
+#ifndef CLONE_NEWNET
+#	define CLONE_NEWNET 0x40000000 /* New network namespace */
+#endif
+
+#endif /* NSENTER_NAMESPACE_H */
diff --git a/libcontainer/nsenter/nsenter.go b/libcontainer/nsenter/nsenter.go
new file mode 100644 (file)
index 0000000..07f4d63
--- /dev/null
@@ -0,0 +1,12 @@
+// +build linux,!gccgo
+
+// Package nsenter registers a C constructor that calls nsexec() (defined
+// in nsexec.c). Constructor functions run when the package is loaded,
+// before any Go code executes, which lets nsexec() do its work ahead of
+// the Go runtime.
+package nsenter
+
+/*
+#cgo CFLAGS: -Wall
+extern void nsexec();
+void __attribute__((constructor)) init(void) {
+	nsexec();
+}
+*/
+import "C"
diff --git a/libcontainer/nsenter/nsenter_gccgo.go b/libcontainer/nsenter/nsenter_gccgo.go
new file mode 100644 (file)
index 0000000..63c7a3e
--- /dev/null
@@ -0,0 +1,25 @@
+// +build linux,gccgo
+
+// Package nsenter (gccgo build): installs the same C constructor calling
+// nsexec() as the gc variant, plus a dummy Go reference to C.init so
+// gccgo does not drop the C symbol at link time.
+package nsenter
+
+/*
+#cgo CFLAGS: -Wall
+extern void nsexec();
+void __attribute__((constructor)) init(void) {
+	nsexec();
+}
+*/
+import "C"
+
+// AlwaysFalse is here to stay false
+// (and be exported so the compiler doesn't optimize out its reference)
+var AlwaysFalse bool
+
+func init() {
+	if AlwaysFalse {
+		// by referencing this C init() in a noop test, it will ensure the compiler
+		// links in the C function.
+		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
+		C.init()
+	}
+}
diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go
new file mode 100644 (file)
index 0000000..c4d3c86
--- /dev/null
@@ -0,0 +1,239 @@
+package nsenter
+
+import (
+       "bytes"
+       "encoding/json"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "os"
+       "os/exec"
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/vishvananda/netlink/nl"
+
+       "golang.org/x/sys/unix"
+)
+
+// pid models the JSON object the re-exec'd child writes back on the init
+// pipe ({"Pid": <n>}).
+type pid struct {
+	Pid int `json:"Pid"`
+}
+
+// logentry models one JSON log record read from the log pipe.
+type logentry struct {
+	Msg   string `json:"msg"`
+	Level string `json:"level"`
+}
+
+// TestNsenterValidPaths re-execs the test binary (argv[0] "nsenter-exec";
+// see the init function in this file) with fd 3 as the bootstrap pipe,
+// sends it a netlink bootstrap message naming a valid pid-namespace path,
+// and expects a clean exit plus a JSON-encoded PID written back.
+func TestNsenterValidPaths(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
+	}
+
+	namespaces := []string{
+		// join pid ns of the current process
+		fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
+	}
+	// Re-exec ourselves; ExtraFiles[0] becomes fd 3 in the child, matching
+	// _LIBCONTAINER_INITPIPE=3 read by the nsexec() C constructor.
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
+		Stdout:     os.Stdout,
+		Stderr:     os.Stderr,
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("nsenter failed to start %v", err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(unix.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	decoder := json.NewDecoder(parent)
+	var pid *pid
+
+	if err := cmd.Wait(); err != nil {
+		t.Fatalf("nsenter exits with a non-zero exit status")
+	}
+	if err := decoder.Decode(&pid); err != nil {
+		// On decode failure, dump the available namespace files to aid debugging.
+		dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid()))
+		for _, d := range dir {
+			t.Log(d.Name())
+		}
+		t.Fatalf("%v", err)
+	}
+
+	// Reap the process reported on the pipe so it does not linger.
+	p, err := os.FindProcess(pid.Pid)
+	if err != nil {
+		t.Fatalf("%v", err)
+	}
+	p.Wait()
+}
+
+// TestNsenterInvalidPaths sends a bootstrap message whose namespace path
+// ("pid:/proc/-1/ns/pid") cannot exist, and expects the re-exec'd child
+// to exit with a non-zero status.
+func TestNsenterInvalidPaths(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
+	}
+
+	namespaces := []string{
+		// a pid namespace path that cannot exist (pid -1)
+		fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
+	}
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatal(err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(unix.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
+		t.Fatalf("nsenter exits with a zero exit status")
+	}
+}
+
+// TestNsenterIncorrectPathType labels the current process's pid-namespace
+// path as type "net"; joining a namespace with the wrong type must fail,
+// so the re-exec'd child is expected to exit non-zero.
+func TestNsenterIncorrectPathType(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
+	}
+
+	namespaces := []string{
+		// deliberately claim type "net" for a pid namespace path
+		fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
+	}
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatal(err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(unix.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
+		t.Fatalf("nsenter exits with a zero exit status")
+	}
+}
+
+// TestNsenterChildLogging re-execs the test binary with both an init pipe
+// (fd 3) and a log pipe (fd 4), and verifies that the child writes at
+// least one well-formed JSON log record with non-empty "level" and "msg"
+// fields before exiting successfully.
+func TestNsenterChildLogging(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create exec pipe %v", err)
+	}
+	logread, logwrite, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("failed to create log pipe %v", err)
+	}
+	defer logread.Close()
+	defer logwrite.Close()
+
+	namespaces := []string{
+		// join pid ns of the current process
+		fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
+	}
+	// fd 3 = init pipe, fd 4 = log pipe (matching the env vars below).
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child, logwrite},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_LOGPIPE=4"},
+		Stdout:     os.Stdout,
+		Stderr:     os.Stderr,
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("nsenter failed to start %v", err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(unix.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	logsDecoder := json.NewDecoder(logread)
+	var logentry *logentry
+
+	err = logsDecoder.Decode(&logentry)
+	if err != nil {
+		t.Fatalf("child log: %v", err)
+	}
+	// Fixed typo in the failure message: "fileds" -> "fields".
+	if logentry.Level == "" || logentry.Msg == "" {
+		t.Fatalf("child log: empty log fields: level=\"%s\" msg=\"%s\"", logentry.Level, logentry.Msg)
+	}
+
+	if err := cmd.Wait(); err != nil {
+		t.Fatalf("nsenter exits with a non-zero exit status")
+	}
+}
+
+func init() {
+       if strings.HasPrefix(os.Args[0], "nsenter-") {
+               os.Exit(0)
+       }
+       return
+}
+
+// newPipe creates a connected AF_LOCAL stream socket pair for bootstrap
+// traffic. fds[1] is returned as the parent end and fds[0] as the child
+// end; callers pass the child end to the re-exec'd process via ExtraFiles.
+func newPipe() (parent *os.File, child *os.File, err error) {
+	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, nil, err
+	}
+	return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
+}
diff --git a/libcontainer/nsenter/nsenter_unsupported.go b/libcontainer/nsenter/nsenter_unsupported.go
new file mode 100644 (file)
index 0000000..2459c63
--- /dev/null
@@ -0,0 +1,3 @@
+// +build !linux !cgo
+
+package nsenter
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
new file mode 100644 (file)
index 0000000..0726568
--- /dev/null
@@ -0,0 +1,1032 @@
+
+#define _GNU_SOURCE
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <linux/limits.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+
+/* Get all of the CLONE_NEW* flags. */
+#include "namespace.h"
+
+/* Synchronisation values exchanged over the parent/child sync pipes. */
+enum sync_t {
+	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
+	SYNC_USERMAP_ACK = 0x41,	/* Mapping finished by the parent. */
+	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
+	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
+	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
+	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+};
+
+/*
+ * Synchronisation value for cgroup namespace setup.
+ * The same constant is defined in process_linux.go as "createCgroupns".
+ */
+#define CREATECGROUPNS 0x80
+
+/* longjmp() arguments: which arm of the setjmp() switch in nsexec() runs. */
+#define JUMP_PARENT 0x00
+#define JUMP_CHILD  0xA0
+#define JUMP_INIT   0xA1
+
+/* Assume the stack grows down, so arguments should be above it. */
+struct clone_t {
+	/*
+	 * Reserve some space for clone() to locate arguments
+	 * and retcode in this place
+	 */
+	char stack[4096] __attribute__ ((aligned(16)));
+	char stack_ptr[0];	/* One past the end of stack: the initial child stack pointer. */
+
+	/* There's two children. This is used to execute the different code. */
+	jmp_buf *env;
+	int jmpval;
+};
+
+/*
+ * Bootstrap configuration parsed from the init pipe's netlink message.
+ * All pointer fields point into the single malloc'd buffer held in data
+ * (see nl_parse()/nl_free()), so they share its lifetime.
+ */
+struct nlconfig_t {
+	char *data;
+
+	/* Process settings. */
+	uint32_t cloneflags;
+	char *oom_score_adj;
+	size_t oom_score_adj_len;
+
+	/* User namespace settings. */
+	char *uidmap;
+	size_t uidmap_len;
+	char *gidmap;
+	size_t gidmap_len;
+	char *namespaces;
+	size_t namespaces_len;
+	uint8_t is_setgroup;
+
+	/* Rootless container settings. */
+	uint8_t is_rootless_euid;	/* boolean */
+	char *uidmappath;
+	size_t uidmappath_len;
+	char *gidmappath;
+	size_t gidmappath_len;
+};
+
+/* Log level strings embedded in the JSON records (see write_log_with_info). */
+#define PANIC   "panic"
+#define FATAL   "fatal"
+#define ERROR   "error"
+#define WARNING "warning"
+#define INFO    "info"
+#define DEBUG   "debug"
+
+/* Log pipe fd set by setup_logpipe(); -1 means logging is disabled. */
+static int logfd = -1;
+
+/*
+ * List of netlink message types sent to us as part of bootstrapping the init.
+ * These constants are defined in libcontainer/message_linux.go.
+ */
+#define INIT_MSG			62000
+#define CLONE_FLAGS_ATTR	27281
+#define NS_PATHS_ATTR		27282
+#define UIDMAP_ATTR			27283
+#define GIDMAP_ATTR			27284
+#define SETGROUP_ATTR		27285
+#define OOM_SCORE_ADJ_ATTR	27286
+#define ROOTLESS_EUID_ATTR	27287
+#define UIDMAPPATH_ATTR	    27288
+#define GIDMAPPATH_ATTR	    27289
+
+/*
+ * Use the raw syscall for versions of glibc which don't include a function for
+ * it, namely (glibc 2.12).
+ */
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
+/* NOTE(review): _GNU_SOURCE is already defined at the top of this file. */
+#	define _GNU_SOURCE
+#	include "syscall.h"
+#	if !defined(SYS_setns) && defined(__NR_setns)
+#		define SYS_setns __NR_setns
+#	endif
+
+#ifndef SYS_setns
+#	error "setns(2) syscall not supported by glibc version"
+#endif
+
+/* Minimal setns(2) wrapper for pre-2.14 glibc. */
+int setns(int fd, int nstype)
+{
+	return syscall(SYS_setns, fd, nstype);
+}
+#endif
+
+/*
+ * Format a message and emit it on the log pipe as one JSON record:
+ *   {"level": "<level>", "msg": "<function>:<line> <message>"}
+ * Silently drops the message when no log pipe is configured (logfd < 0)
+ * or level is NULL.
+ *
+ * NOTE(review): neither the function name nor the formatted message is
+ * JSON-escaped; a '"' or '\' produced by the format would corrupt the
+ * record.
+ */
+static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+{
+	char message[1024] = {};
+
+	va_list args;
+
+	if (logfd < 0 || level == NULL)
+		return;
+
+	va_start(args, format);
+	if (vsnprintf(message, sizeof(message), format, args) < 0)
+		goto done;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
+done:
+	va_end(args);
+}
+
+/* Log with the calling function's name and line number filled in. */
+#define write_log(level, fmt, ...) \
+	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+
+/* XXX: This is ugly. */
+static int syncfd = -1;
+
+/* Log a FATAL message (appending strerror via %m) and exit(1). */
+#define bail(fmt, ...)                                       \
+	do {                                                       \
+		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		exit(1);                                                 \
+	} while(0)
+
+/*
+ * Open the file named by the printf-style pathfmt/... arguments and write
+ * exactly data_len bytes of data to it. Returns 0 on success and -1 on
+ * any failure (errno is left as set by the failing open/write); a short
+ * write counts as failure.
+ */
+static int write_file(char *data, size_t data_len, char *pathfmt, ...)
+{
+	int fd, len, ret = 0;
+	char path[PATH_MAX];
+
+	va_list ap;
+	va_start(ap, pathfmt);
+	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
+	va_end(ap);
+	if (len < 0)
+		return -1;
+
+	fd = open(path, O_RDWR);
+	if (fd < 0) {
+		return -1;
+	}
+
+	len = write(fd, data, data_len);
+	if (len != data_len) {
+		ret = -1;
+		goto out;
+	}
+
+ out:
+	close(fd);
+	return ret;
+}
+
+/* Policy for /proc/<pid>/setgroups: leave untouched, "allow", or "deny". */
+enum policy_t {
+	SETGROUPS_DEFAULT = 0,
+	SETGROUPS_ALLOW,
+	SETGROUPS_DENY,
+};
+
+/*
+ * Write the chosen policy string to /proc/<pid>/setgroups. This *must* be
+ * called before we touch gid_map (see user_namespaces(7) for the ordering
+ * requirement). SETGROUPS_DEFAULT writes nothing; a missing file (ENOENT,
+ * kernel too old to have it) is tolerated; any other failure bails.
+ */
+static void update_setgroups(int pid, enum policy_t setgroup)
+{
+	char *policy;
+
+	switch (setgroup) {
+	case SETGROUPS_ALLOW:
+		policy = "allow";
+		break;
+	case SETGROUPS_DENY:
+		policy = "deny";
+		break;
+	case SETGROUPS_DEFAULT:
+	default:
+		/* Nothing to do. */
+		return;
+	}
+
+	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
+		/*
+		 * If the kernel is too old to support /proc/pid/setgroups,
+		 * open(2) or write(2) will return ENOENT. This is fine.
+		 */
+		if (errno != ENOENT)
+			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
+	}
+}
+
+/*
+ * Fork and exec the external mapping helper @app (newuidmap/newgidmap
+ * style) as "app <pid> <map fields...>", where @map is split into
+ * arguments on spaces and newlines. Returns the helper's exit status;
+ * bails if the helper cannot be forked or exec'd.
+ *
+ * NOTE(review): map_len is unused -- @map is assumed NUL-terminated.
+ * NOTE(review): WEXITSTATUS() is only defined for WIFEXITED(); for a
+ * signalled child it can read as 0 and be mistaken for success.
+ */
+static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
+{
+	int child;
+
+	/*
+	 * If @app is NULL, execve will segfault. Just check it here and bail (if
+	 * we're in this path, the caller is already getting desperate and there
+	 * isn't a backup to this failing). This usually would be a configuration
+	 * or programming issue.
+	 */
+	if (!app)
+		bail("mapping tool not present");
+
+	child = fork();
+	if (child < 0)
+		bail("failed to fork");
+
+	if (!child) {
+#define MAX_ARGV 20
+		char *argv[MAX_ARGV];
+		char *envp[] = { NULL };
+		char pid_fmt[16];
+		int argc = 0;
+		char *next;
+
+		snprintf(pid_fmt, 16, "%d", pid);
+
+		argv[argc++] = (char *)app;
+		argv[argc++] = pid_fmt;
+		/*
+		 * Convert the map string into a list of argument that
+		 * newuidmap/newgidmap can understand.
+		 */
+
+		while (argc < MAX_ARGV) {
+			if (*map == '\0') {
+				argv[argc++] = NULL;
+				break;
+			}
+			argv[argc++] = map;
+			next = strpbrk(map, "\n ");
+			if (next == NULL)
+				break;
+			*next++ = '\0';
+			map = next + strspn(next, "\n ");
+		}
+
+		execve(app, argv, envp);
+		bail("failed to execv");
+	} else {
+		int status;
+
+		while (true) {
+			if (waitpid(child, &status, 0) < 0) {
+				if (errno == EINTR)
+					continue;
+				bail("failed to waitpid");
+			}
+			if (WIFEXITED(status) || WIFSIGNALED(status))
+				return WEXITSTATUS(status);
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Install @map as /proc/<pid>/uid_map. If the direct write fails with
+ * EPERM (unprivileged caller), fall back to the external helper at @path
+ * (newuidmap); any other failure is fatal. No-op when @map is empty.
+ */
+static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
+{
+	if (map == NULL || map_len <= 0)
+		return;
+
+	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
+		if (errno != EPERM)
+			bail("failed to update /proc/%d/uid_map", pid);
+		if (try_mapping_tool(path, pid, map, map_len))
+			bail("failed to use newuid map on %d", pid);
+	}
+}
+
+/* Same as update_uidmap, but for /proc/<pid>/gid_map via newgidmap. */
+static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
+{
+	if (map == NULL || map_len <= 0)
+		return;
+
+	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
+		if (errno != EPERM)
+			bail("failed to update /proc/%d/gid_map", pid);
+		if (try_mapping_tool(path, pid, map, map_len))
+			bail("failed to use newgid map on %d", pid);
+	}
+}
+
+/* Write @data to /proc/self/oom_score_adj; no-op when empty, fatal on error. */
+static void update_oom_score_adj(char *data, size_t len)
+{
+	if (data == NULL || len <= 0)
+		return;
+
+	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
+		bail("failed to update /proc/self/oom_score_adj");
+}
+
+/* A dummy function that just jumps to the given jumpval. */
+static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg)
+{
+	struct clone_t *ca = (struct clone_t *)arg;
+	longjmp(*ca->env, ca->jmpval);
+}
+
+/*
+ * clone(2) a sibling process (CLONE_PARENT, so it shares our parent)
+ * whose only action is to longjmp() back into @env with @jmpval -- i.e.
+ * the new process resumes inside the setjmp() switch in nsexec() rather
+ * than in a separate entry function. Returns the new PID (-1 on failure).
+ */
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval)
+{
+	struct clone_t ca = {
+		.env = env,
+		.jmpval = jmpval,
+	};
+
+	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+}
+
+/*
+ * Gets the init pipe fd from the environment, which is used to read the
+ * bootstrap data and tell the parent what the new pid is after we finish
+ * setting up the environment. Returns -1 when _LIBCONTAINER_INITPIPE is
+ * unset or empty (i.e. no bootstrap was requested); bails if it is set
+ * but not a valid number.
+ */
+static int initpipe(void)
+{
+	int pipenum;
+	char *initpipe, *endptr;
+
+	initpipe = getenv("_LIBCONTAINER_INITPIPE");
+	if (initpipe == NULL || *initpipe == '\0')
+		return -1;
+
+	pipenum = strtol(initpipe, &endptr, 10);
+	if (*endptr != '\0')
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
+
+	return pipenum;
+}
+
+/*
+ * Parse _LIBCONTAINER_LOGPIPE into the global logfd. Logging stays
+ * disabled (logfd == -1) when the variable is unset or empty; a value
+ * that is not a number is fatal -- we exit(1) directly because it is too
+ * early to use bail, which itself writes through logfd.
+ */
+static void setup_logpipe(void)
+{
+	char *logpipe, *endptr;
+
+	logpipe = getenv("_LIBCONTAINER_LOGPIPE");
+	if (logpipe == NULL || *logpipe == '\0') {
+		return;
+	}
+
+	logfd = strtol(logpipe, &endptr, 10);
+	if (logpipe == endptr || *endptr != '\0') {
+		fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe);
+		/* It is too early to use bail */
+		exit(1);
+	}
+}
+
+/*
+ * Returns the clone(2) flag for a namespace, given the name of a
+ * namespace (the short names used in /proc/<pid>/ns/...). Unrecognised
+ * names map to 0.
+ */
+static int nsflag(char *name)
+{
+	if (!strcmp(name, "cgroup"))
+		return CLONE_NEWCGROUP;
+	else if (!strcmp(name, "ipc"))
+		return CLONE_NEWIPC;
+	else if (!strcmp(name, "mnt"))
+		return CLONE_NEWNS;
+	else if (!strcmp(name, "net"))
+		return CLONE_NEWNET;
+	else if (!strcmp(name, "pid"))
+		return CLONE_NEWPID;
+	else if (!strcmp(name, "user"))
+		return CLONE_NEWUSER;
+	else if (!strcmp(name, "uts"))
+		return CLONE_NEWUTS;
+
+	/* If we don't recognise a name, fallback to 0. */
+	return 0;
+}
+
+/*
+ * Read a native-endian integer out of a netlink attribute payload.
+ * NOTE(review): these dereference into the malloc'd payload buffer;
+ * the 4-byte load relies on NLA_ALIGN() keeping payloads aligned.
+ */
+static uint32_t readint32(char *buf)
+{
+	return *(uint32_t *) buf;
+}
+
+static uint8_t readint8(char *buf)
+{
+	return *(uint8_t *) buf;
+}
+
+/*
+ * Read one complete netlink message from @fd and scatter its attributes
+ * into *config. The payload is read into a single malloc'd buffer stored
+ * in config->data; every pointer attribute points into that buffer, so it
+ * must stay alive until nl_free() is called. Any short read, error
+ * message, or unknown attribute type is fatal.
+ *
+ * NOTE(review): nla_len is trusted without bounds-checking against the
+ * remaining payload; the sender is runc's own Go bootstrap code (see
+ * libcontainer/message_linux.go), so this is trusted input.
+ */
+static void nl_parse(int fd, struct nlconfig_t *config)
+{
+	size_t len, size;
+	struct nlmsghdr hdr;
+	char *data, *current;
+
+	/* Retrieve the netlink header. */
+	len = read(fd, &hdr, NLMSG_HDRLEN);
+	if (len != NLMSG_HDRLEN)
+		bail("invalid netlink header length %zu", len);
+
+	if (hdr.nlmsg_type == NLMSG_ERROR)
+		bail("failed to read netlink message");
+
+	if (hdr.nlmsg_type != INIT_MSG)
+		bail("unexpected msg type %d", hdr.nlmsg_type);
+
+	/* Retrieve data. */
+	size = NLMSG_PAYLOAD(&hdr, 0);
+	current = data = malloc(size);
+	if (!data)
+		bail("failed to allocate %zu bytes of memory for nl_payload", size);
+
+	len = read(fd, data, size);
+	if (len != size)
+		bail("failed to read netlink payload, %zu != %zu", len, size);
+
+	/* Parse the netlink payload. */
+	config->data = data;
+	while (current < data + size) {
+		struct nlattr *nlattr = (struct nlattr *)current;
+		size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
+
+		/* Advance to payload. */
+		current += NLA_HDRLEN;
+
+		/* Handle payload. */
+		switch (nlattr->nla_type) {
+		case CLONE_FLAGS_ATTR:
+			config->cloneflags = readint32(current);
+			break;
+		case ROOTLESS_EUID_ATTR:
+			config->is_rootless_euid = readint8(current);	/* boolean */
+			break;
+		case OOM_SCORE_ADJ_ATTR:
+			config->oom_score_adj = current;
+			config->oom_score_adj_len = payload_len;
+			break;
+		case NS_PATHS_ATTR:
+			config->namespaces = current;
+			config->namespaces_len = payload_len;
+			break;
+		case UIDMAP_ATTR:
+			config->uidmap = current;
+			config->uidmap_len = payload_len;
+			break;
+		case GIDMAP_ATTR:
+			config->gidmap = current;
+			config->gidmap_len = payload_len;
+			break;
+		case UIDMAPPATH_ATTR:
+			config->uidmappath = current;
+			config->uidmappath_len = payload_len;
+			break;
+		case GIDMAPPATH_ATTR:
+			config->gidmappath = current;
+			config->gidmappath_len = payload_len;
+			break;
+		case SETGROUP_ATTR:
+			config->is_setgroup = readint8(current);
+			break;
+		default:
+			bail("unknown netlink message type %d", nlattr->nla_type);
+		}
+
+		/* Step to the next aligned attribute. */
+		current += NLA_ALIGN(payload_len);
+	}
+}
+
+/* Free the buffer backing the attribute pointers parsed by nl_parse(). */
+void nl_free(struct nlconfig_t *config)
+{
+	free(config->data);
+}
+
+/*
+ * Join every namespace in @nslist, a comma-separated list of
+ * "<type>:<path>" entries. All paths are opened up front, because once we
+ * setns(2) into the mount namespace the remaining paths may no longer be
+ * reachable; the joins then happen strictly in list order (the Go side
+ * guarantees the user namespace comes first). Any parse, open or setns
+ * failure is fatal.
+ *
+ * NOTE(review): namespace_t.type is never filled in -- only fd, ns and
+ * path are actually used.
+ */
+void join_namespaces(char *nslist)
+{
+	int num = 0, i;
+	char *saveptr = NULL;
+	char *namespace = strtok_r(nslist, ",", &saveptr);
+	struct namespace_t {
+		int fd;
+		int ns;
+		char type[PATH_MAX];
+		char path[PATH_MAX];
+	} *namespaces = NULL;
+
+	if (!namespace || !strlen(namespace) || !strlen(nslist))
+		bail("ns paths are empty");
+
+	/*
+	 * We have to open the file descriptors first, since after
+	 * we join the mnt namespace we might no longer be able to
+	 * access the paths.
+	 */
+	do {
+		int fd;
+		char *path;
+		struct namespace_t *ns;
+
+		/* Resize the namespace array. */
+		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		if (!namespaces)
+			bail("failed to reallocate namespace array");
+		ns = &namespaces[num - 1];
+
+		/* Split 'ns:path'. */
+		path = strstr(namespace, ":");
+		if (!path)
+			bail("failed to parse %s", namespace);
+		*path++ = '\0';
+
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			bail("failed to open %s", path);
+
+		ns->fd = fd;
+		ns->ns = nsflag(namespace);
+		strncpy(ns->path, path, PATH_MAX - 1);
+		ns->path[PATH_MAX - 1] = '\0';
+	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+
+	/*
+	 * The ordering in which we join namespaces is important. We should
+	 * always join the user namespace *first*. This is all guaranteed
+	 * from the container_linux.go side of this, so we're just going to
+	 * follow the order given to us.
+	 */
+
+	for (i = 0; i < num; i++) {
+		struct namespace_t ns = namespaces[i];
+
+		if (setns(ns.fd, ns.ns) < 0)
+			bail("failed to setns to %s", ns.path);
+
+		close(ns.fd);
+	}
+
+	free(namespaces);
+}
+
+/* Defined in cloned_binary.c. */
+extern int ensure_cloned_binary(void);
+
+void nsexec(void)
+{
+       int pipenum;
+       jmp_buf env;
+       int sync_child_pipe[2], sync_grandchild_pipe[2];
+       struct nlconfig_t config = { 0 };
+
+       /*
+        * Setup a pipe to send logs to the parent. This should happen
+        * first, because bail will use that pipe.
+        */
+       setup_logpipe();
+
+       /*
+        * If we don't have an init pipe, just return to the go routine.
+        * We'll only get an init pipe for start or exec.
+        */
+       pipenum = initpipe();
+       if (pipenum == -1)
+               return;
+
+       /*
+        * We need to re-exec if we are not in a cloned binary. This is necessary
+        * to ensure that containers won't be able to access the host binary
+        * through /proc/self/exe. See CVE-2019-5736.
+        */
+       if (ensure_cloned_binary() < 0)
+               bail("could not ensure we are a cloned binary");
+
+       write_log(DEBUG, "nsexec started");
+
+       /* Parse all of the netlink configuration. */
+       nl_parse(pipenum, &config);
+
+       /* Set oom_score_adj. This has to be done before !dumpable because
+        * /proc/self/oom_score_adj is not writeable unless you're an privileged
+        * user (if !dumpable is set). All children inherit their parent's
+        * oom_score_adj value on fork(2) so this will always be propagated
+        * properly.
+        */
+       update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
+
+       /*
+        * Make the process non-dumpable, to avoid various race conditions that
+        * could cause processes in namespaces we're joining to access host
+        * resources (or potentially execute code).
+        *
+        * However, if the number of namespaces we are joining is 0, we are not
+        * going to be switching to a different security context. Thus setting
+        * ourselves to be non-dumpable only breaks things (like rootless
+        * containers), which is the recommendation from the kernel folks.
+        */
+       if (config.namespaces) {
+               if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+                       bail("failed to set process as non-dumpable");
+       }
+
+       /* Pipe so we can tell the child when we've finished setting up. */
+       if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
+               bail("failed to setup sync pipe between parent and child");
+
+       /*
+        * We need a new socketpair to sync with grandchild so we don't have
+        * race condition with child.
+        */
+       if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
+               bail("failed to setup sync pipe between parent and grandchild");
+
+       /* TODO: Currently we aren't dealing with child deaths properly. */
+
+       /*
+        * Okay, so this is quite annoying.
+        *
+        * In order for this unsharing code to be more extensible we need to split
+        * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
+        * would be if we did clone(CLONE_NEWUSER) and the other namespaces
+        * separately, but because of SELinux issues we cannot really do that. But
+        * we cannot just dump the namespace flags into clone(...) because several
+        * usecases (such as rootless containers) require more granularity around
+        * the namespace setup. In addition, some older kernels had issues where
+        * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
+        * handle this while also dealing with SELinux so we choose SELinux support
+        * over broken kernel support).
+        *
+        * However, if we unshare(2) the user namespace *before* we clone(2), then
+        * all hell breaks loose.
+        *
+        * The parent no longer has permissions to do many things (unshare(2) drops
+        * all capabilities in your old namespace), and the container cannot be set
+        * up to have more than one {uid,gid} mapping. This is obviously less than
+        * ideal. In order to fix this, we have to first clone(2) and then unshare.
+        *
+        * Unfortunately, it's not as simple as that. We have to fork to enter the
+        * PID namespace (the PID namespace only applies to children). Since we'll
+        * have to double-fork, this clone_parent() call won't be able to get the
+        * PID of the _actual_ init process (without doing more synchronisation than
+        * I can deal with at the moment). So we'll just get the parent to send it
+        * for us, the only job of this process is to update
+        * /proc/pid/{setgroups,uid_map,gid_map}.
+        *
+        * And as a result of the above, we also need to setns(2) in the first child
+        * because if we join a PID namespace in the topmost parent then our child
+        * will be in that namespace (and it will not be able to give us a PID value
+        * that makes sense without resorting to sending things with cmsg).
+        *
+        * This also deals with an older issue caused by dumping cloneflags into
+        * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
+        * we have to unshare(2) before clone(2) in order to do this. This was fixed
+        * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
+        * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
+        * aware, the last mainline kernel which had this bug was Linux 3.12.
+        * However, we cannot comment on which kernels the broken patch was
+        * backported to.
+        *
+        * -- Aleksa "what has my life come to?" Sarai
+        */
+
+       switch (setjmp(env)) {
+               /*
+                * Stage 0: We're in the parent. Our job is just to create a new child
+                *          (stage 1: JUMP_CHILD) process and write its uid_map and
+                *          gid_map. That process will go on to create a new process, then
+                *          it will send us its PID which we will send to the bootstrap
+                *          process.
+                */
+       case JUMP_PARENT:{
+                       int len;
+                       pid_t child, first_child = -1;
+                       bool ready = false;
+
+                       /* For debugging. */
+                       prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+
+                       /* Start the process of getting a container. */
+                       child = clone_parent(&env, JUMP_CHILD);
+                       if (child < 0)
+                               bail("unable to fork: child_func");
+
+                       /*
+                        * State machine for synchronisation with the children.
+                        *
+                        * Father only return when both child and grandchild are
+                        * ready, so we can receive all possible error codes
+                        * generated by children.
+                        */
+                       while (!ready) {
+                               enum sync_t s;
+
+                               syncfd = sync_child_pipe[1];
+                               close(sync_child_pipe[0]);
+
+                               if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+                                       bail("failed to sync with child: next state");
+
+                               switch (s) {
+                               case SYNC_USERMAP_PLS:
+                                       /*
+                                        * Enable setgroups(2) if we've been asked to. But we also
+                                        * have to explicitly disable setgroups(2) if we're
+                                        * creating a rootless container for single-entry mapping.
+                                        * i.e. config.is_setgroup == false.
+                                        * (this is required since Linux 3.19).
+                                        *
+                                        * For rootless multi-entry mapping, config.is_setgroup shall be true and
+                                        * newuidmap/newgidmap shall be used.
+                                        */
+
+                                       if (config.is_rootless_euid && !config.is_setgroup)
+                                               update_setgroups(child, SETGROUPS_DENY);
+
+                                       /* Set up mappings. */
+                                       update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
+                                       update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+
+                                       s = SYNC_USERMAP_ACK;
+                                       if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                                               kill(child, SIGKILL);
+                                               bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+                                       }
+                                       break;
+                               case SYNC_RECVPID_PLS:{
+                                               first_child = child;
+
+                                               /* Get the init_func pid. */
+                                               if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
+                                                       kill(first_child, SIGKILL);
+                                                       bail("failed to sync with child: read(childpid)");
+                                               }
+
+                                               /* Send ACK. */
+                                               s = SYNC_RECVPID_ACK;
+                                               if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                                                       kill(first_child, SIGKILL);
+                                                       kill(child, SIGKILL);
+                                                       bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
+                                               }
+
+                                               /* Send the init_func pid back to our parent.
+                                                *
+                                                * Send the init_func pid and the pid of the first child back to our parent.
+                                                * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
+                                                * It becomes the responsibility of our parent to reap the first child.
+                                                */
+                                               len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
+                                               if (len < 0) {
+                                                       kill(child, SIGKILL);
+                                                       bail("unable to generate JSON for child pid");
+                                               }
+                                       }
+                                       break;
+                               case SYNC_CHILD_READY:
+                                       ready = true;
+                                       break;
+                               default:
+                                       bail("unexpected sync value: %u", s);
+                               }
+                       }
+
+                       /* Now sync with grandchild. */
+
+                       ready = false;
+                       while (!ready) {
+                               enum sync_t s;
+
+                               syncfd = sync_grandchild_pipe[1];
+                               close(sync_grandchild_pipe[0]);
+
+                               s = SYNC_GRANDCHILD;
+                               if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                                       kill(child, SIGKILL);
+                                       bail("failed to sync with child: write(SYNC_GRANDCHILD)");
+                               }
+
+                               if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+                                       bail("failed to sync with child: next state");
+
+                               switch (s) {
+                               case SYNC_CHILD_READY:
+                                       ready = true;
+                                       break;
+                               default:
+                                       bail("unexpected sync value: %u", s);
+                               }
+                       }
+                       exit(0);
+               }
+
+               /*
+                * Stage 1: We're in the first child process. Our job is to join any
+                *          provided namespaces in the netlink payload and unshare all
+                *          of the requested namespaces. If we've been asked to
+                *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
+                *          our user mappings for us. Then, we create a new child
+                *          (stage 2: JUMP_INIT) for PID namespace. We then send the
+                *          child's PID to our parent (stage 0).
+                */
+       case JUMP_CHILD:{
+                       pid_t child;
+                       enum sync_t s;
+
+                       /* We're in a child and thus need to tell the parent if we die. */
+                       syncfd = sync_child_pipe[0];
+                       close(sync_child_pipe[1]);
+
+                       /* For debugging. */
+                       prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+
+                       /*
+                        * We need to setns first. We cannot do this earlier (in stage 0)
+                        * because of the fact that we forked to get here (the PID of
+                        * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+                        * using cmsg(3) but that's just annoying.
+                        */
+                       if (config.namespaces)
+                               join_namespaces(config.namespaces);
+
+                       /*
+                        * Deal with user namespaces first. They are quite special, as they
+                        * affect our ability to unshare other namespaces and are used as
+                        * context for privilege checks.
+                        *
+                        * We don't unshare all namespaces in one go. The reason for this
+                        * is that, while the kernel documentation may claim otherwise,
+                        * there are certain cases where unsharing all namespaces at once
+                        * will result in namespace objects being owned incorrectly.
+                        * Ideally we should just fix these kernel bugs, but it's better to
+                        * be safe than sorry, and fix them separately.
+                        *
+                        * A specific case of this is that the SELinux label of the
+                        * internal kern-mount that mqueue uses will be incorrect if the
+                        * UTS namespace is cloned before the USER namespace is mapped.
+                        * I've also heard of similar problems with the network namespace
+                        * in some scenarios. This also mirrors how LXC deals with this
+                        * problem.
+                        */
+                       if (config.cloneflags & CLONE_NEWUSER) {
+                               if (unshare(CLONE_NEWUSER) < 0)
+                                       bail("failed to unshare user namespace");
+                               config.cloneflags &= ~CLONE_NEWUSER;
+
+                               /*
+                                * We don't have the privileges to do any mapping here (see the
+                                * clone_parent rant). So signal our parent to hook us up.
+                                */
+
+                               /* Switching is only necessary if we joined namespaces. */
+                               if (config.namespaces) {
+                                       if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
+                                               bail("failed to set process as dumpable");
+                               }
+                               s = SYNC_USERMAP_PLS;
+                               if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+                                       bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
+
+                               /* ... wait for mapping ... */
+
+                               if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+                                       bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
+                               if (s != SYNC_USERMAP_ACK)
+                                       bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
+                               /* Switching is only necessary if we joined namespaces. */
+                               if (config.namespaces) {
+                                       if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+                                               bail("failed to set process as dumpable");
+                               }
+
+                               /* Become root in the namespace proper. */
+                               if (setresuid(0, 0, 0) < 0)
+                                       bail("failed to become root in user namespace");
+                       }
+                       /*
+                        * Unshare all of the namespaces. Now, it should be noted that this
+                        * ordering might break in the future (especially with rootless
+                        * containers). But for now, it's not possible to split this into
+                        * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
+                        *
+                        * Note that we don't merge this with clone() because there were
+                        * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
+                        * was broken, so we'll just do it the long way anyway.
+                        */
+                       if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
+                               bail("failed to unshare namespaces");
+
+                       /*
+                        * TODO: What about non-namespace clone flags that we're dropping here?
+                        *
+                        * We fork again because of PID namespace, setns(2) or unshare(2) don't
+                        * change the PID namespace of the calling process, because doing so
+                        * would change the caller's idea of its own PID (as reported by getpid()),
+                        * which would break many applications and libraries, so we must fork
+                        * to actually enter the new PID namespace.
+                        */
+                       child = clone_parent(&env, JUMP_INIT);
+                       if (child < 0)
+                               bail("unable to fork: init_func");
+
+                       /* Send the child to our parent, which knows what it's doing. */
+                       s = SYNC_RECVPID_PLS;
+                       if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                               kill(child, SIGKILL);
+                               bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
+                       }
+                       if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
+                               kill(child, SIGKILL);
+                               bail("failed to sync with parent: write(childpid)");
+                       }
+
+                       /* ... wait for parent to get the pid ... */
+
+                       if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                               kill(child, SIGKILL);
+                               bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
+                       }
+                       if (s != SYNC_RECVPID_ACK) {
+                               kill(child, SIGKILL);
+                               bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
+                       }
+
+                       s = SYNC_CHILD_READY;
+                       if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+                               kill(child, SIGKILL);
+                               bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+                       }
+
+                       /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+                       exit(0);
+               }
+
+               /*
+                * Stage 2: We're the final child process, and the only process that will
+                *          actually return to the Go runtime. Our job is to just do the
+                *          final cleanup steps and then return to the Go runtime to allow
+                *          init_linux.go to run.
+                */
+       case JUMP_INIT:{
+                       /*
+                        * We're inside the child now, having jumped from the
+                        * start_child() code after forking in the parent.
+                        */
+                       enum sync_t s;
+
+                       /* We're in a child and thus need to tell the parent if we die. */
+                       syncfd = sync_grandchild_pipe[0];
+                       close(sync_grandchild_pipe[1]);
+                       close(sync_child_pipe[0]);
+                       close(sync_child_pipe[1]);
+
+                       /* For debugging. */
+                       prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+
+                       if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+                               bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
+                       if (s != SYNC_GRANDCHILD)
+                               bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
+
+                       if (setsid() < 0)
+                               bail("setsid failed");
+
+                       if (setuid(0) < 0)
+                               bail("setuid failed");
+
+                       if (setgid(0) < 0)
+                               bail("setgid failed");
+
+                       if (!config.is_rootless_euid && config.is_setgroup) {
+                               if (setgroups(0, NULL) < 0)
+                                       bail("setgroups failed");
+                       }
+
+                       /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+                       if (config.cloneflags & CLONE_NEWCGROUP) {
+                               uint8_t value;
+                               if (read(pipenum, &value, sizeof(value)) != sizeof(value))
+                                       bail("read synchronisation value failed");
+                               if (value == CREATECGROUPNS) {
+                                       if (unshare(CLONE_NEWCGROUP) < 0)
+                                               bail("failed to unshare cgroup namespace");
+                               } else
+                                       bail("received unknown synchronisation value");
+                       }
+
+                       s = SYNC_CHILD_READY;
+                       if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+                               bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+
+                       /* Close sync pipes. */
+                       close(sync_grandchild_pipe[0]);
+
+                       /* Free netlink data. */
+                       nl_free(&config);
+
+                       /* Finish executing, let the Go runtime take over. */
+                       return;
+               }
+       default:
+               bail("unexpected jump value");
+       }
+
+       /* Should never be reached. */
+       bail("should never be reached");
+}
diff --git a/libcontainer/process.go b/libcontainer/process.go
new file mode 100644 (file)
index 0000000..d3e472a
--- /dev/null
@@ -0,0 +1,115 @@
+package libcontainer
+
+import (
+       "fmt"
+       "io"
+       "math"
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// processOperations is the internal, platform-specific implementation
+// backing the public Wait/Signal/Pid methods on Process; it is nil until
+// the process has been started.
+type processOperations interface {
+	wait() (*os.ProcessState, error)
+	signal(sig os.Signal) error
+	pid() int
+}
+
+// Process specifies the configuration and IO for a process inside
+// a container.
+type Process struct {
+	// The command to be run followed by any arguments.
+	Args []string
+
+	// Env specifies the environment variables for the process.
+	Env []string
+
+	// User will set the uid and gid of the executing process running inside the container
+	// local to the container's user and group configuration.
+	User string
+
+	// AdditionalGroups specifies the gids that should be added to supplementary groups
+	// in addition to those that the user belongs to.
+	AdditionalGroups []string
+
+	// Cwd will change the processes current working directory inside the container's rootfs.
+	Cwd string
+
+	// Stdin is a pointer to a reader which provides the standard input stream.
+	Stdin io.Reader
+
+	// Stdout is a pointer to a writer which receives the standard output stream.
+	Stdout io.Writer
+
+	// Stderr is a pointer to a writer which receives the standard error stream.
+	Stderr io.Writer
+
+	// ExtraFiles specifies additional open files to be inherited by the container
+	ExtraFiles []*os.File
+
+	// Initial sizings for the console
+	ConsoleWidth  uint16
+	ConsoleHeight uint16
+
+	// Capabilities specify the capabilities to keep when executing the process inside the container
+	// All capabilities not specified will be dropped from the processes capability mask
+	Capabilities *configs.Capabilities
+
+	// AppArmorProfile specifies the profile to apply to the process and is
+	// changed at the time the process is execed
+	AppArmorProfile string
+
+	// Label specifies the label to apply to the process.  It is commonly used by selinux
+	Label string
+
+	// NoNewPrivileges controls whether processes can gain additional privileges.
+	NoNewPrivileges *bool
+
+	// Rlimits specifies the resource limits, such as max open files, to set in the container
+	// If Rlimits are not set, the container will inherit rlimits from the parent process
+	Rlimits []configs.Rlimit
+
+	// ConsoleSocket provides the masterfd console.
+	ConsoleSocket *os.File
+
+	// Init specifies whether the process is the first process in the container.
+	Init bool
+
+	// ops is the platform-specific implementation backing Wait, Pid and
+	// Signal; it stays nil until the process has been started, and those
+	// methods return a NoProcessOps error while it is nil.
+	ops processOperations
+
+	// LogLevel is the log level for the container process.
+	// NOTE(review): exact semantics are not visible in this file -- confirm
+	// against the code that consumes it.
+	LogLevel string
+}
+
+// Wait waits for the process to exit.
+// Wait releases any resources associated with the Process.
+// If the process was never started (ops is nil), a generic error with
+// cause NoProcessOps is returned.
+func (p Process) Wait() (*os.ProcessState, error) {
+	if p.ops == nil {
+		return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+	}
+	return p.ops.wait()
+}
+
+// Pid returns the process ID.
+// If the process was never started (ops is nil), math.MinInt32 is
+// returned together with a NoProcessOps error.
+func (p Process) Pid() (int, error) {
+	// math.MinInt32 is returned here, because it's invalid value
+	// for the kill() system call.
+	if p.ops == nil {
+		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+	}
+	return p.ops.pid(), nil
+}
+
+// Signal sends a signal to the Process.
+// If the process was never started (ops is nil), a NoProcessOps error is
+// returned.
+func (p Process) Signal(sig os.Signal) error {
+	if p.ops == nil {
+		return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+	}
+	return p.ops.signal(sig)
+}
+
+// IO holds the process's STDIO from the parent's point of view: Stdin is
+// written to, Stdout and Stderr are read from.
+type IO struct {
+	Stdin  io.WriteCloser
+	Stdout io.ReadCloser
+	Stderr io.ReadCloser
+}
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
new file mode 100644 (file)
index 0000000..de989b5
--- /dev/null
@@ -0,0 +1,598 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "encoding/json"
+       "errors"
+       "fmt"
+       "io"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "strconv"
+       "syscall" // only for Signal
+
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/libcontainer/logs"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/runc/libcontainer/utils"
+
+       "golang.org/x/sys/unix"
+)
+
+// Synchronisation value for cgroup namespace setup.
+// The same constant is defined in nsexec.c as "CREATECGROUPNS".
+const createCgroupns = 0x80
+
+// parentProcess is the parent-side handle for a container process; it is
+// implemented by setnsProcess and initProcess below.
+type parentProcess interface {
+	// pid returns the pid for the running process.
+	pid() int
+
+	// start starts the process execution.
+	start() error
+
+	// send a SIGKILL to the process and wait for the exit.
+	terminate() error
+
+	// wait waits on the process returning the process state.
+	wait() (*os.ProcessState, error)
+
+	// startTime returns the process start time.
+	startTime() (uint64, error)
+
+	// signal sends the given signal to the process.
+	signal(os.Signal) error
+
+	// externalDescriptors returns the recorded string forms of the
+	// process's external file descriptors.
+	externalDescriptors() []string
+
+	// setExternalDescriptors records the string forms of the process's
+	// external file descriptors.
+	setExternalDescriptors(fds []string)
+
+	// forwardChildLogs starts forwarding the child's log output.
+	forwardChildLogs()
+}
+
+// filePair holds both ends of a socket/pipe pair shared with the child;
+// the parent keeps parent, while child is inherited by the child process
+// and closed on the parent side once the child has started.
+type filePair struct {
+	parent *os.File
+	child  *os.File
+}
+
+// setnsProcess is the parent-side handle for a process started via the
+// setns bootstrap (see execSetns), which joins an existing container's
+// namespaces rather than creating new ones.
+type setnsProcess struct {
+	cmd             *exec.Cmd
+	messageSockPair filePair
+	logFilePair     filePair
+	cgroupPaths     map[string]string
+	rootlessCgroups bool
+	intelRdtPath    string
+	config          *initConfig
+	fds             []string
+	process         *Process
+	bootstrapData   io.Reader
+}
+
+// startTime returns the process start time as reported by system.Stat.
+func (p *setnsProcess) startTime() (uint64, error) {
+	stat, err := system.Stat(p.pid())
+	return stat.StartTime, err
+}
+
+// signal sends sig to the process; only syscall.Signal values are
+// supported.
+func (p *setnsProcess) signal(sig os.Signal) error {
+	s, ok := sig.(syscall.Signal)
+	if !ok {
+		return errors.New("os: unsupported signal type")
+	}
+	return unix.Kill(p.pid(), s)
+}
+
+// start launches the prepared command, feeds it the bootstrap data and the
+// init configuration, attaches the resulting process to the configured
+// cgroups (and the Intel RDT group, when its filesystem path exists), and
+// synchronises with the child over the message socket.
+func (p *setnsProcess) start() (err error) {
+	defer p.messageSockPair.parent.Close()
+	err = p.cmd.Start()
+	// close the write-side of the pipes (controlled by child)
+	p.messageSockPair.child.Close()
+	p.logFilePair.child.Close()
+	if err != nil {
+		return newSystemErrorWithCause(err, "starting setns process")
+	}
+	if p.bootstrapData != nil {
+		// Hand the bootstrap payload to the child over the message socket.
+		if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
+			return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+		}
+	}
+	// Wait for the bootstrap process to finish and learn the final
+	// child's pid (see execSetns).
+	if err = p.execSetns(); err != nil {
+		return newSystemErrorWithCause(err, "executing setns process")
+	}
+	if len(p.cgroupPaths) > 0 {
+		// Cgroup attach failures are tolerated for rootless containers
+		// (rootlessCgroups), where we may not own the cgroup hierarchy.
+		if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
+			return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
+		}
+	}
+	if p.intelRdtPath != "" {
+		// if Intel RDT "resource control" filesystem path exists
+		_, err := os.Stat(p.intelRdtPath)
+		if err == nil {
+			if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
+				return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
+			}
+		}
+	}
+	// set rlimits, this has to be done here because we lose permissions
+	// to raise the limits once we enter a user-namespace
+	if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+		return newSystemErrorWithCause(err, "setting rlimits for process")
+	}
+	if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
+		return newSystemErrorWithCause(err, "writing config to pipe")
+	}
+
+	// Drain the sync stream; a setns process must never emit the
+	// procReady/procHooks states used by a full init process.
+	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
+		switch sync.Type {
+		case procReady:
+			// This shouldn't happen.
+			panic("unexpected procReady in setns")
+		case procHooks:
+			// This shouldn't happen.
+			panic("unexpected procHooks in setns")
+		default:
+			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
+		}
+	})
+
+	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
+		return newSystemErrorWithCause(err, "calling shutdown on init pipe")
+	}
+	// Must be done after Shutdown so the child will exit and we can wait for it.
+	if ierr != nil {
+		p.wait()
+		return ierr
+	}
+	return nil
+}
+
+// execSetns runs the process that executes C code to perform the setns calls
+// because setns support requires the C process to fork off a child and perform the setns
+// before the go runtime boots, we wait on the process to die and receive the child's pid
+// over the provided pipe.
+func (p *setnsProcess) execSetns() error {
+	// Reap the bootstrap process itself first.
+	status, err := p.cmd.Process.Wait()
+	if err != nil {
+		p.cmd.Wait()
+		return newSystemErrorWithCause(err, "waiting on setns process to finish")
+	}
+	if !status.Success() {
+		p.cmd.Wait()
+		return newSystemError(&exec.ExitError{ProcessState: status})
+	}
+	// The bootstrap process reports both the final pid and the pid of the
+	// intermediate (first) child as JSON on the message socket.
+	var pid *pid
+	if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
+		p.cmd.Wait()
+		return newSystemErrorWithCause(err, "reading pid from init pipe")
+	}
+
+	// Clean up the zombie parent process
+	// On Unix systems FindProcess always succeeds.
+	firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
+
+	// Ignore the error in case the child has already been reaped for any reason
+	_, _ = firstChildProcess.Wait()
+
+	// Retarget p.cmd at the final child so later wait/signal/pid calls
+	// address it, and hook this handle up to the public Process.
+	process, err := os.FindProcess(pid.Pid)
+	if err != nil {
+		return err
+	}
+	p.cmd.Process = process
+	p.process.ops = p
+	return nil
+}
+
+// terminate sends a SIGKILL to the forked process for the setns routine then waits to
+// avoid the process becoming a zombie.
+func (p *setnsProcess) terminate() error {
+	if p.cmd.Process == nil {
+		return nil
+	}
+	err := p.cmd.Process.Kill()
+	// Prefer the Kill error; fall back to the wait error.
+	if _, werr := p.wait(); err == nil {
+		err = werr
+	}
+	return err
+}
+
+// wait waits for the process to exit and returns its final state.
+func (p *setnsProcess) wait() (*os.ProcessState, error) {
+	err := p.cmd.Wait()
+
+	// Return actual ProcessState even on Wait error
+	return p.cmd.ProcessState, err
+}
+
+// pid returns the pid of the (possibly retargeted, see execSetns) process.
+func (p *setnsProcess) pid() int {
+	return p.cmd.Process.Pid
+}
+
+// externalDescriptors returns the recorded descriptor strings.
+func (p *setnsProcess) externalDescriptors() []string {
+	return p.fds
+}
+
+// setExternalDescriptors records the descriptor strings for the process.
+func (p *setnsProcess) setExternalDescriptors(newFds []string) {
+	p.fds = newFds
+}
+
+// forwardChildLogs forwards the child's log pipe in a background goroutine.
+func (p *setnsProcess) forwardChildLogs() {
+	go logs.ForwardLogs(p.logFilePair.parent)
+}
+
+// initProcess is the parent-side handle for a container's init process
+// (Process.Init == true), created rather than joined into existing
+// namespaces.
+type initProcess struct {
+	cmd             *exec.Cmd
+	messageSockPair filePair
+	logFilePair     filePair
+	config          *initConfig
+	manager         cgroups.Manager
+	intelRdtManager intelrdt.Manager
+	container       *linuxContainer
+	fds             []string
+	process         *Process
+	bootstrapData   io.Reader
+	sharePidns      bool
+}
+
+func (p *initProcess) pid() int {
+       return p.cmd.Process.Pid
+}
+
+func (p *initProcess) externalDescriptors() []string {
+       return p.fds
+}
+
+// getChildPid receives the final child's pid over the provided pipe.
+func (p *initProcess) getChildPid() (int, error) {
+       var pid pid
+       if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
+               p.cmd.Wait()
+               return -1, err
+       }
+
+       // Clean up the zombie parent process
+       // On Unix systems FindProcess always succeeds.
+       firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
+
+       // Ignore the error in case the child has already been reaped for any reason
+       _, _ = firstChildProcess.Wait()
+
+       return pid.Pid, nil
+}
+
+func (p *initProcess) waitForChildExit(childPid int) error {
+       status, err := p.cmd.Process.Wait()
+       if err != nil {
+               p.cmd.Wait()
+               return err
+       }
+       if !status.Success() {
+               p.cmd.Wait()
+               return &exec.ExitError{ProcessState: status}
+       }
+
+       process, err := os.FindProcess(childPid)
+       if err != nil {
+               return err
+       }
+       p.cmd.Process = process
+       p.process.ops = p
+       return nil
+}
+
// start launches the container's init command and drives the bootstrap
// protocol with it: stream the netlink bootstrap data, learn the final
// child's PID, apply cgroup/Intel RDT configuration, exchange sync messages
// (procReady/procHooks) while running prestart hooks, and finally shut down
// the write side of the init pipe.
// NOTE(review): statement ordering here is load-bearing (cgroup Apply must
// precede syncing with the child; Shutdown must precede the final wait) —
// do not reorder casually.
func (p *initProcess) start() error {
	defer p.messageSockPair.parent.Close()
	err := p.cmd.Start()
	p.process.ops = p
	// close the write-side of the pipes (controlled by child)
	p.messageSockPair.child.Close()
	p.logFilePair.child.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}
	// Do this before syncing with child so that no children can escape the
	// cgroup. We don't need to worry about not doing this and not being root
	// because we'd be using the rootless cgroup manager in that case.
	if err := p.manager.Apply(p.pid()); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}
	if p.intelRdtManager != nil {
		if err := p.intelRdtManager.Apply(p.pid()); err != nil {
			return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
		}
	}
	// Tear down the cgroup/RDT state if any later bootstrap step fails.
	defer func() {
		if err != nil {
			// TODO: should not be the responsibility to call here
			p.manager.Destroy()
			if p.intelRdtManager != nil {
				p.intelRdtManager.Destroy()
			}
		}
	}()

	if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
	}
	childPid, err := p.getChildPid()
	if err != nil {
		return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
	}

	// Save the standard descriptor names before the container process
	// can potentially move them (e.g., via dup2()).  If we don't do this now,
	// we won't know at checkpoint time which file descriptor to look up.
	fds, err := getPipeFds(childPid)
	if err != nil {
		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
	}
	p.setExternalDescriptors(fds)
	// Do this before syncing with child so that no children
	// can escape the cgroup
	if err := p.manager.Apply(childPid); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}
	if p.intelRdtManager != nil {
		if err := p.intelRdtManager.Apply(childPid); err != nil {
			return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
		}
	}
	// Now it's time to setup cgroup namespace
	if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
		if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
			return newSystemErrorWithCause(err, "sending synchronization value to init process")
		}
	}

	// Wait for our first child to exit
	if err := p.waitForChildExit(childPid); err != nil {
		return newSystemErrorWithCause(err, "waiting for our first child to exit")
	}

	defer func() {
		if err != nil {
			// TODO: should not be the responsibility to call here
			p.manager.Destroy()
			if p.intelRdtManager != nil {
				p.intelRdtManager.Destroy()
			}
		}
	}()
	if err := p.createNetworkInterfaces(); err != nil {
		return newSystemErrorWithCause(err, "creating network interfaces")
	}
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		sentRun    bool
		sentResume bool
	)

	// Process the child's sync messages until its side of the pipe closes.
	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
		switch sync.Type {
		case procReady:
			// set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting rlimits for ready process")
			}
			// call prestart hooks
			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
				// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
				if err := p.manager.Set(p.config.Config); err != nil {
					return newSystemErrorWithCause(err, "setting cgroup config for ready process")
				}
				if p.intelRdtManager != nil {
					if err := p.intelRdtManager.Set(p.config.Config); err != nil {
						return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
					}
				}

				if p.config.Config.Hooks != nil {
					s, err := p.container.currentOCIState()
					if err != nil {
						return err
					}
					// initProcessStartTime hasn't been set yet.
					s.Pid = p.cmd.Process.Pid
					s.Status = "creating"
					for i, hook := range p.config.Config.Hooks.Prestart {
						if err := hook.Run(s); err != nil {
							return newSystemErrorWithCausef(err, "running prestart hook %d", i)
						}
					}
				}
			}
			// Sync with child.
			if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'run'")
			}
			sentRun = true
		case procHooks:
			// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
			if err := p.manager.Set(p.config.Config); err != nil {
				return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
			}
			if p.intelRdtManager != nil {
				if err := p.intelRdtManager.Set(p.config.Config); err != nil {
					return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
				}
			}
			if p.config.Config.Hooks != nil {
				s, err := p.container.currentOCIState()
				if err != nil {
					return err
				}
				// initProcessStartTime hasn't been set yet.
				s.Pid = p.cmd.Process.Pid
				s.Status = "creating"
				for i, hook := range p.config.Config.Hooks.Prestart {
					if err := hook.Run(s); err != nil {
						return newSystemErrorWithCausef(err, "running prestart hook %d", i)
					}
				}
			}
			// Sync with child.
			if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'resume'")
			}
			sentResume = true
		default:
			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
		}

		return nil
	})

	if !sentRun {
		return newSystemErrorWithCause(ierr, "container init")
	}
	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
		return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
	}
	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "shutting down init pipe")
	}

	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		p.wait()
		return ierr
	}
	return nil
}
+
+func (p *initProcess) wait() (*os.ProcessState, error) {
+       err := p.cmd.Wait()
+       if err != nil {
+               return p.cmd.ProcessState, err
+       }
+       // we should kill all processes in cgroup when init is died if we use host PID namespace
+       if p.sharePidns {
+               signalAllProcesses(p.manager, unix.SIGKILL)
+       }
+       return p.cmd.ProcessState, nil
+}
+
+func (p *initProcess) terminate() error {
+       if p.cmd.Process == nil {
+               return nil
+       }
+       err := p.cmd.Process.Kill()
+       if _, werr := p.wait(); err == nil {
+               err = werr
+       }
+       return err
+}
+
+func (p *initProcess) startTime() (uint64, error) {
+       stat, err := system.Stat(p.pid())
+       return stat.StartTime, err
+}
+
+func (p *initProcess) sendConfig() error {
+       // send the config to the container's init process, we don't use JSON Encode
+       // here because there might be a problem in JSON decoder in some cases, see:
+       // https://github.com/docker/docker/issues/14203#issuecomment-174177790
+       return utils.WriteJSON(p.messageSockPair.parent, p.config)
+}
+
+func (p *initProcess) createNetworkInterfaces() error {
+       for _, config := range p.config.Config.Networks {
+               strategy, err := getStrategy(config.Type)
+               if err != nil {
+                       return err
+               }
+               n := &network{
+                       Network: *config,
+               }
+               if err := strategy.create(n, p.pid()); err != nil {
+                       return err
+               }
+               p.config.Networks = append(p.config.Networks, n)
+       }
+       return nil
+}
+
+func (p *initProcess) signal(sig os.Signal) error {
+       s, ok := sig.(syscall.Signal)
+       if !ok {
+               return errors.New("os: unsupported signal type")
+       }
+       return unix.Kill(p.pid(), s)
+}
+
+func (p *initProcess) setExternalDescriptors(newFds []string) {
+       p.fds = newFds
+}
+
+func (p *initProcess) forwardChildLogs() {
+       go logs.ForwardLogs(p.logFilePair.parent)
+}
+
+func getPipeFds(pid int) ([]string, error) {
+       fds := make([]string, 3)
+
+       dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
+       for i := 0; i < 3; i++ {
+               // XXX: This breaks if the path is not a valid symlink (which can
+               //      happen in certain particularly unlucky mount namespace setups).
+               f := filepath.Join(dirPath, strconv.Itoa(i))
+               target, err := os.Readlink(f)
+               if err != nil {
+                       // Ignore permission errors, for rootless containers and other
+                       // non-dumpable processes. if we can't get the fd for a particular
+                       // file, there's not much we can do.
+                       if os.IsPermission(err) {
+                               continue
+                       }
+                       return fds, err
+               }
+               fds[i] = target
+       }
+       return fds, nil
+}
+
+// InitializeIO creates pipes for use with the process's stdio and returns the
+// opposite side for each. Do not use this if you want to have a pseudoterminal
+// set up for you by libcontainer (TODO: fix that too).
+// TODO: This is mostly unnecessary, and should be handled by clients.
+func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
+       var fds []uintptr
+       i = &IO{}
+       // cleanup in case of an error
+       defer func() {
+               if err != nil {
+                       for _, fd := range fds {
+                               unix.Close(int(fd))
+                       }
+               }
+       }()
+       // STDIN
+       r, w, err := os.Pipe()
+       if err != nil {
+               return nil, err
+       }
+       fds = append(fds, r.Fd(), w.Fd())
+       p.Stdin, i.Stdin = r, w
+       // STDOUT
+       if r, w, err = os.Pipe(); err != nil {
+               return nil, err
+       }
+       fds = append(fds, r.Fd(), w.Fd())
+       p.Stdout, i.Stdout = w, r
+       // STDERR
+       if r, w, err = os.Pipe(); err != nil {
+               return nil, err
+       }
+       fds = append(fds, r.Fd(), w.Fd())
+       p.Stderr, i.Stderr = w, r
+       // change ownership of the pipes in case we are in a user namespace
+       for _, fd := range fds {
+               if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
+                       return nil, err
+               }
+       }
+       return i, nil
+}
diff --git a/libcontainer/restored_process.go b/libcontainer/restored_process.go
new file mode 100644 (file)
index 0000000..28d52ad
--- /dev/null
@@ -0,0 +1,128 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/system"
+)
+
+func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) {
+       var (
+               err error
+       )
+       proc, err := os.FindProcess(pid)
+       if err != nil {
+               return nil, err
+       }
+       stat, err := system.Stat(pid)
+       if err != nil {
+               return nil, err
+       }
+       return &restoredProcess{
+               proc:             proc,
+               processStartTime: stat.StartTime,
+               fds:              fds,
+       }, nil
+}
+
+type restoredProcess struct {
+       proc             *os.Process
+       processStartTime uint64
+       fds              []string
+}
+
+func (p *restoredProcess) start() error {
+       return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
+}
+
+func (p *restoredProcess) pid() int {
+       return p.proc.Pid
+}
+
+func (p *restoredProcess) terminate() error {
+       err := p.proc.Kill()
+       if _, werr := p.wait(); err == nil {
+               err = werr
+       }
+       return err
+}
+
+func (p *restoredProcess) wait() (*os.ProcessState, error) {
+       // TODO: how do we wait on the actual process?
+       // maybe use --exec-cmd in criu
+       st, err := p.proc.Wait()
+       if err != nil {
+               return nil, err
+       }
+       return st, nil
+}
+
+func (p *restoredProcess) startTime() (uint64, error) {
+       return p.processStartTime, nil
+}
+
+func (p *restoredProcess) signal(s os.Signal) error {
+       return p.proc.Signal(s)
+}
+
+func (p *restoredProcess) externalDescriptors() []string {
+       return p.fds
+}
+
+func (p *restoredProcess) setExternalDescriptors(newFds []string) {
+       p.fds = newFds
+}
+
+func (p *restoredProcess) forwardChildLogs() {
+}
+
+// nonChildProcess represents a process where the calling process is not
+// the parent process.  This process is created when a factory loads a container from
+// a persisted state.
+type nonChildProcess struct {
+       processPid       int
+       processStartTime uint64
+       fds              []string
+}
+
+func (p *nonChildProcess) start() error {
+       return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
+}
+
+func (p *nonChildProcess) pid() int {
+       return p.processPid
+}
+
+func (p *nonChildProcess) terminate() error {
+       return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError)
+}
+
+func (p *nonChildProcess) wait() (*os.ProcessState, error) {
+       return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError)
+}
+
+func (p *nonChildProcess) startTime() (uint64, error) {
+       return p.processStartTime, nil
+}
+
+func (p *nonChildProcess) signal(s os.Signal) error {
+       proc, err := os.FindProcess(p.processPid)
+       if err != nil {
+               return err
+       }
+       return proc.Signal(s)
+}
+
+func (p *nonChildProcess) externalDescriptors() []string {
+       return p.fds
+}
+
+func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
+       p.fds = newFds
+}
+
+func (p *nonChildProcess) forwardChildLogs() {
+}
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
new file mode 100644 (file)
index 0000000..106c4c2
--- /dev/null
@@ -0,0 +1,1009 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "io"
+       "io/ioutil"
+       "os"
+       "os/exec"
+       "path"
+       "path/filepath"
+       "strings"
+       "time"
+
+       securejoin "github.com/cyphar/filepath-securejoin"
+       "github.com/mrunalp/fileutils"
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/mount"
+       "github.com/opencontainers/runc/libcontainer/system"
+       libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/selinux/go-selinux/label"
+
+       "golang.org/x/sys/unix"
+)
+
// defaultMountFlags are the mount(2) flags applied to container-managed
// mounts unless overridden: no exec, no setuid binaries, no device nodes.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+
+// needsSetupDev returns true if /dev needs to be set up.
+func needsSetupDev(config *configs.Config) bool {
+       for _, m := range config.Mounts {
+               if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+                       return false
+               }
+       }
+       return true
+}
+
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
// It also synchronises with the parent over pipe so that prestart hooks can
// run while the old root is still reachable.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
	config := iConfig.Config
	if err := prepareRoot(config); err != nil {
		return newSystemErrorWithCause(err, "preparing rootfs")
	}

	hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
	setupDev := needsSetupDev(config)
	// Perform each configured mount, bracketed by its optional pre/post
	// mount hook commands.
	for _, m := range config.Mounts {
		for _, precmd := range m.PremountCmds {
			if err := mountCmd(precmd); err != nil {
				return newSystemErrorWithCause(err, "running premount command")
			}
		}
		if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
			return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
		}

		for _, postcmd := range m.PostmountCmds {
			if err := mountCmd(postcmd); err != nil {
				return newSystemErrorWithCause(err, "running postmount command")
			}
		}
	}

	if setupDev {
		if err := createDevices(config); err != nil {
			return newSystemErrorWithCause(err, "creating device nodes")
		}
		if err := setupPtmx(config); err != nil {
			return newSystemErrorWithCause(err, "setting up ptmx")
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return newSystemErrorWithCause(err, "setting up /dev symlinks")
		}
	}

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	if err := unix.Chdir(config.Rootfs); err != nil {
		return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
	}

	// Switch the root using whichever mechanism the configuration allows.
	if config.NoPivotRoot {
		err = msMoveRoot(config.Rootfs)
	} else if config.Namespaces.Contains(configs.NEWNS) {
		err = pivotRoot(config.Rootfs)
	} else {
		err = chroot(config.Rootfs)
	}
	if err != nil {
		return newSystemErrorWithCause(err, "jailing process inside rootfs")
	}

	if setupDev {
		if err := reOpenDevNull(); err != nil {
			return newSystemErrorWithCause(err, "reopening /dev/null inside container")
		}
	}

	if cwd := iConfig.Cwd; cwd != "" {
		// Note that spec.Process.Cwd can contain unclean value like  "../../../../foo/bar...".
		// However, we are safe to call MkDirAll directly because we are in the jail here.
		if err := os.MkdirAll(cwd, 0755); err != nil {
			return err
		}
	}

	return nil
}
+
+// finalizeRootfs sets anything to ro if necessary. You must call
+// prepareRootfs first.
+func finalizeRootfs(config *configs.Config) (err error) {
+       // remount dev as ro if specified
+       for _, m := range config.Mounts {
+               if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+                       if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
+                               if err := remountReadonly(m); err != nil {
+                                       return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
+                               }
+                       }
+                       break
+               }
+       }
+
+       // set rootfs ( / ) as readonly
+       if config.Readonlyfs {
+               if err := setReadonly(); err != nil {
+                       return newSystemErrorWithCause(err, "setting rootfs as readonly")
+               }
+       }
+
+       unix.Umask(0022)
+       return nil
+}
+
// prepareTmp creates a scratch directory under topTmpDir and turns it into a
// private bind mount: /tmp has to be mounted as private to allow MS_MOVE to
// work in all situations.
func prepareTmp(topTmpDir string) (string, error) {
	tmpdir, err := ioutil.TempDir(topTmpDir, "runctop")
	if err != nil {
		return "", err
	}
	// Bind the directory over itself so it becomes a mount point...
	if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
		return "", err
	}
	// ...then mark that mount private so shared propagation cannot interfere
	// with a later MS_MOVE.
	if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
		return "", err
	}
	return tmpdir, nil
}
+
// cleanupTmp undoes prepareTmp: it unmounts the private bind mount (best
// effort — the error is deliberately ignored) and removes the directory.
func cleanupTmp(tmpdir string) error {
	unix.Unmount(tmpdir, 0)
	return os.RemoveAll(tmpdir)
}
+
+func mountCmd(cmd configs.Command) error {
+       command := exec.Command(cmd.Path, cmd.Args[:]...)
+       command.Env = cmd.Env
+       command.Dir = cmd.Dir
+       if out, err := command.CombinedOutput(); err != nil {
+               return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
+       }
+       return nil
+}
+
+func prepareBindMount(m *configs.Mount, rootfs string) error {
+       stat, err := os.Stat(m.Source)
+       if err != nil {
+               // error out if the source of a bind mount does not exist as we will be
+               // unable to bind anything to it.
+               return err
+       }
+       // ensure that the destination of the bind mount is resolved of symlinks at mount time because
+       // any previous mounts can invalidate the next mount's destination.
+       // this can happen when a user specifies mounts within other mounts to cause breakouts or other
+       // evil stuff to try to escape the container's rootfs.
+       var dest string
+       if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
+               return err
+       }
+       if err := checkProcMount(rootfs, dest, m.Source); err != nil {
+               return err
+       }
+       // update the mount with the correct dest after symlinks are resolved.
+       m.Destination = dest
+       if err := createIfNotExists(dest, stat.IsDir()); err != nil {
+               return err
+       }
+
+       return nil
+}
+
// mountCgroupV1 mounts the legacy (v1) cgroup hierarchies under
// m.Destination: a tmpfs base plus one mount per subsystem, with
// compatibility symlinks for comma-merged subsystems such as "cpu,cpuacct".
// When enableCgroupns is set a fresh cgroup mount is created per subsystem
// instead of bind-mounting the host's hierarchy.
func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
	binds, err := getCgroupMounts(m)
	if err != nil {
		return err
	}
	// Collect merged subsystem names (those containing a comma) so the
	// per-subsystem symlinks can be created after the mounts.
	var merged []string
	for _, b := range binds {
		ss := filepath.Base(b.Destination)
		if strings.Contains(ss, ",") {
			merged = append(merged, ss)
		}
	}
	// The base of the hierarchy is a plain tmpfs the subsystem mounts sit on.
	tmpfs := &configs.Mount{
		Source:           "tmpfs",
		Device:           "tmpfs",
		Destination:      m.Destination,
		Flags:            defaultMountFlags,
		Data:             "mode=755",
		PropagationFlags: m.PropagationFlags,
	}
	if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil {
		return err
	}
	for _, b := range binds {
		if enableCgroupns {
			subsystemPath := filepath.Join(rootfs, b.Destination)
			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
				return err
			}
			// Propagate a requested read-only flag to the new cgroup mount.
			flags := defaultMountFlags
			if m.Flags&unix.MS_RDONLY != 0 {
				flags = flags | unix.MS_RDONLY
			}
			cgroupmount := &configs.Mount{
				Source:      "cgroup",
				Device:      "cgroup",
				Destination: subsystemPath,
				Flags:       flags,
				Data:        filepath.Base(subsystemPath),
			}
			if err := mountNewCgroup(cgroupmount); err != nil {
				return err
			}
		} else {
			if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil {
				return err
			}
		}
	}
	for _, mc := range merged {
		for _, ss := range strings.Split(mc, ",") {
			// symlink(2) is very dumb, it will just shove the path into
			// the link and doesn't do any checks or relative path
			// conversion. Also, don't error out if the cgroup already exists.
			if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
				return err
			}
		}
	}
	return nil
}
+
+func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
+       cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination)
+       if err != nil {
+               return err
+       }
+       if err := os.MkdirAll(cgroupPath, 0755); err != nil {
+               return err
+       }
+       if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
+               // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
+               if err == unix.EPERM || err == unix.EBUSY {
+                       return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
+               }
+               return err
+       }
+       return nil
+}
+
// mountToRootfs mounts a single configs.Mount into the container's rootfs.
// Each supported device type has dedicated handling; in every case the
// destination is forced to live under rootfs.
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
	var (
		dest = m.Destination
	)
	if !strings.HasPrefix(dest, rootfs) {
		dest = filepath.Join(rootfs, dest)
	}

	switch m.Device {
	case "proc", "sysfs":
		// If the destination already exists and is not a directory, we bail
		// out This is to avoid mounting through a symlink or similar -- which
		// has been a "fun" attack scenario in the past.
		// TODO: This won't be necessary once we switch to libpathrs and we can
		//       stop all of these symlink-exchange attacks.
		if fi, err := os.Lstat(dest); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
		} else if fi.Mode()&os.ModeDir == 0 {
			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
		}
		if err := os.MkdirAll(dest, 0755); err != nil {
			return err
		}
		// Selinux kernels do not support labeling of /proc or /sys
		return mountPropagate(m, rootfs, "")
	case "mqueue":
		if err := os.MkdirAll(dest, 0755); err != nil {
			return err
		}
		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
			// older kernels do not support labeling of /dev/mqueue
			if err := mountPropagate(m, rootfs, ""); err != nil {
				return err
			}
			return label.SetFileLabel(dest, mountLabel)
		}
		return nil
	case "tmpfs":
		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
		tmpDir := ""
		// stat is taken before dest may be created below; it stays nil when
		// dest did not previously exist and is used at the end to restore
		// the original mode bits of a pre-existing destination.
		stat, err := os.Stat(dest)
		if err != nil {
			if err := os.MkdirAll(dest, 0755); err != nil {
				return err
			}
		}
		if copyUp {
			// tmpcopyup: mount the tmpfs on a scratch directory first so the
			// current contents of dest can be copied onto it before the
			// mount is moved over dest.
			tmpdir, err := prepareTmp("/tmp")
			if err != nil {
				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
			}
			defer cleanupTmp(tmpdir)
			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
			if err != nil {
				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
			}
			defer os.RemoveAll(tmpDir)
			// mountPropagate mounts at m.Destination verbatim in the copy-up
			// case, so point it at the scratch directory.
			m.Destination = tmpDir
		}
		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
			return err
		}
		if copyUp {
			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
				}
				return errMsg
			}
			// Swing the populated tmpfs over the real destination.
			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
				}
				return errMsg
			}
		}
		if stat != nil {
			// Preserve the mode of the pre-existing destination directory.
			if err = os.Chmod(dest, stat.Mode()); err != nil {
				return err
			}
		}
		return nil
	case "bind":
		if err := prepareBindMount(m, rootfs); err != nil {
			return err
		}
		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
			return err
		}
		// bind mount won't change mount options, we need remount to make mount options effective.
		// first check that we have non-default options required before attempting a remount
		if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
			// only remount if unique mount options are set
			if err := remount(m, rootfs); err != nil {
				return err
			}
		}

		if m.Relabel != "" {
			if err := label.Validate(m.Relabel); err != nil {
				return err
			}
			shared := label.IsShared(m.Relabel)
			if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
				return err
			}
		}
	case "cgroup":
		// Dispatch on the host's cgroup mode (unified v2 vs. legacy v1).
		if cgroups.IsCgroup2UnifiedMode() {
			if err := mountCgroupV2(m, rootfs, mountLabel, enableCgroupns); err != nil {
				return err
			}
		} else {

			if err := mountCgroupV1(m, rootfs, mountLabel, enableCgroupns); err != nil {
				return err
			}
		}
		if m.Flags&unix.MS_RDONLY != 0 {
			// remount cgroup root as readonly
			mcgrouproot := &configs.Mount{
				Source:      m.Destination,
				Device:      "bind",
				Destination: m.Destination,
				Flags:       defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND,
			}
			if err := remount(mcgrouproot, rootfs); err != nil {
				return err
			}
		}
	default:
		// ensure that the destination of the mount is resolved of symlinks at mount time because
		// any previous mounts can invalidate the next mount's destination.
		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
		// evil stuff to try to escape the container's rootfs.
		var err error
		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
			return err
		}
		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
			return err
		}
		// update the mount with the correct dest after symlinks are resolved.
		m.Destination = dest
		if err := os.MkdirAll(dest, 0755); err != nil {
			return err
		}
		return mountPropagate(m, rootfs, mountLabel)
	}
	return nil
}
+
+func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
+       mounts, err := cgroups.GetCgroupMounts(false)
+       if err != nil {
+               return nil, err
+       }
+
+       cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
+       if err != nil {
+               return nil, err
+       }
+
+       var binds []*configs.Mount
+
+       for _, mm := range mounts {
+               dir, err := mm.GetOwnCgroup(cgroupPaths)
+               if err != nil {
+                       return nil, err
+               }
+               relDir, err := filepath.Rel(mm.Root, dir)
+               if err != nil {
+                       return nil, err
+               }
+               binds = append(binds, &configs.Mount{
+                       Device:           "bind",
+                       Source:           filepath.Join(mm.Mountpoint, relDir),
+                       Destination:      filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
+                       Flags:            unix.MS_BIND | unix.MS_REC | m.Flags,
+                       PropagationFlags: m.PropagationFlags,
+               })
+       }
+
+       return binds, nil
+}
+
+// checkProcMount checks to ensure that the mount destination is not over the top of /proc.
+// dest is required to be an abs path and have any symlinks resolved before calling this function.
+//
+// if source is nil, don't stat the filesystem.  This is used for restore of a checkpoint.
+func checkProcMount(rootfs, dest, source string) error {
+       const procPath = "/proc"
+       // White list, it should be sub directories of invalid destinations
+       validDestinations := []string{
+               // These entries can be bind mounted by files emulated by fuse,
+               // so commands like top, free displays stats in container.
+               "/proc/cpuinfo",
+               "/proc/diskstats",
+               "/proc/meminfo",
+               "/proc/stat",
+               "/proc/swaps",
+               "/proc/uptime",
+               "/proc/loadavg",
+               "/proc/net/dev",
+       }
+       for _, valid := range validDestinations {
+               path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
+               if err != nil {
+                       return err
+               }
+               if path == "." {
+                       return nil
+               }
+       }
+       path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
+       if err != nil {
+               return err
+       }
+       // pass if the mount path is located outside of /proc
+       if strings.HasPrefix(path, "..") {
+               return nil
+       }
+       if path == "." {
+               // an empty source is pasted on restore
+               if source == "" {
+                       return nil
+               }
+               // only allow a mount on-top of proc if it's source is "proc"
+               isproc, err := isProc(source)
+               if err != nil {
+                       return err
+               }
+               // pass if the mount is happening on top of /proc and the source of
+               // the mount is a proc filesystem
+               if isproc {
+                       return nil
+               }
+               return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
+       }
+       return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
+}
+
+func isProc(path string) (bool, error) {
+       var s unix.Statfs_t
+       if err := unix.Statfs(path, &s); err != nil {
+               return false, err
+       }
+       return s.Type == unix.PROC_SUPER_MAGIC, nil
+}
+
// setupDevSymlinks populates rootfs' /dev with the conventional symlinks
// into /proc/self/fd (fd, stdin, stdout, stderr). Symlinks that already
// exist are left alone.
func setupDevSymlinks(rootfs string) error {
	links := [][2]string{
		{"/proc/self/fd", "/dev/fd"},
		{"/proc/self/fd/0", "/dev/stdin"},
		{"/proc/self/fd/1", "/dev/stdout"},
		{"/proc/self/fd/2", "/dev/stderr"},
	}
	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a
	// symlink in /dev if it exists in /proc.
	if _, err := os.Stat("/proc/kcore"); err == nil {
		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
	}
	for _, link := range links {
		src, dst := link[0], filepath.Join(rootfs, link[1])
		err := os.Symlink(src, dst)
		if err != nil && !os.IsExist(err) {
			return fmt.Errorf("symlink %s %s %s", src, dst, err)
		}
	}
	return nil
}
+
+// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
+// this method will make them point to `/dev/null` in this container's rootfs.  This
+// needs to be called after we chroot/pivot into the container's rootfs so that any
+// symlinks are resolved locally.
+func reOpenDevNull() error {
+       var stat, devNullStat unix.Stat_t
+       file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
+       if err != nil {
+               return fmt.Errorf("Failed to open /dev/null - %s", err)
+       }
+       defer file.Close()
+       if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
+               return err
+       }
+       for fd := 0; fd < 3; fd++ {
+               if err := unix.Fstat(fd, &stat); err != nil {
+                       return err
+               }
+               if stat.Rdev == devNullStat.Rdev {
+                       // Close and re-open the fd.
+                       if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
+                               return err
+                       }
+               }
+       }
+       return nil
+}
+
+// Create the device nodes in the container.
+func createDevices(config *configs.Config) error {
+       useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
+       oldMask := unix.Umask(0000)
+       for _, node := range config.Devices {
+               // containers running in a user namespace are not allowed to mknod
+               // devices so we can just bind mount it from the host.
+               if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
+                       unix.Umask(oldMask)
+                       return err
+               }
+       }
+       unix.Umask(oldMask)
+       return nil
+}
+
+func bindMountDeviceNode(dest string, node *configs.Device) error {
+       f, err := os.Create(dest)
+       if err != nil && !os.IsExist(err) {
+               return err
+       }
+       if f != nil {
+               f.Close()
+       }
+       return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
+}
+
+// Creates the device node in the rootfs of the container.
+func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
+       dest := filepath.Join(rootfs, node.Path)
+       if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
+               return err
+       }
+
+       if bind {
+               return bindMountDeviceNode(dest, node)
+       }
+       if err := mknodDevice(dest, node); err != nil {
+               if os.IsExist(err) {
+                       return nil
+               } else if os.IsPermission(err) {
+                       return bindMountDeviceNode(dest, node)
+               }
+               return err
+       }
+       return nil
+}
+
// mknodDevice creates the special file at dest with mknod(2), translating
// the config's single-letter device type into the matching S_IF* mode bits,
// then chowns it to the configured uid/gid.
func mknodDevice(dest string, node *configs.Device) error {
	fileMode := node.FileMode
	switch node.Type {
	case 'c', 'u':
		// character device
		fileMode |= unix.S_IFCHR
	case 'b':
		// block device
		fileMode |= unix.S_IFBLK
	case 'p':
		// FIFO
		fileMode |= unix.S_IFIFO
	default:
		return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
	}
	if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
		return err
	}
	return unix.Chown(dest, int(node.Uid), int(node.Gid))
}
+
+func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
+       for _, m := range mountinfo {
+               if m.Mountpoint == dir {
+                       return m
+               }
+       }
+       return nil
+}
+
+// Get the parent mount point of directory passed in as argument. Also return
+// optional fields.
+func getParentMount(rootfs string) (string, string, error) {
+       var path string
+
+       mountinfos, err := mount.GetMounts()
+       if err != nil {
+               return "", "", err
+       }
+
+       mountinfo := getMountInfo(mountinfos, rootfs)
+       if mountinfo != nil {
+               return rootfs, mountinfo.Optional, nil
+       }
+
+       path = rootfs
+       for {
+               path = filepath.Dir(path)
+
+               mountinfo = getMountInfo(mountinfos, path)
+               if mountinfo != nil {
+                       return path, mountinfo.Optional, nil
+               }
+
+               if path == "/" {
+                       break
+               }
+       }
+
+       // If we are here, we did not find parent mount. Something is wrong.
+       return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs)
+}
+
+// Make parent mount private if it was shared
+func rootfsParentMountPrivate(rootfs string) error {
+       sharedMount := false
+
+       parentMount, optionalOpts, err := getParentMount(rootfs)
+       if err != nil {
+               return err
+       }
+
+       optsSplit := strings.Split(optionalOpts, " ")
+       for _, opt := range optsSplit {
+               if strings.HasPrefix(opt, "shared:") {
+                       sharedMount = true
+                       break
+               }
+       }
+
+       // Make parent mount PRIVATE if it was shared. It is needed for two
+       // reasons. First of all pivot_root() will fail if parent mount is
+       // shared. Secondly when we bind mount rootfs it will propagate to
+       // parent namespace and we don't want that to happen.
+       if sharedMount {
+               return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "")
+       }
+
+       return nil
+}
+
// prepareRoot configures mount propagation for the namespace and turns the
// rootfs into a mount point (required later by pivot_root(2)).
func prepareRoot(config *configs.Config) error {
	// Default to recursive-slave propagation unless the config asks for
	// something specific.
	flag := unix.MS_SLAVE | unix.MS_REC
	if config.RootPropagation != 0 {
		flag = config.RootPropagation
	}
	if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil {
		return err
	}

	// Make parent mount private to make sure following bind mount does
	// not propagate in other namespaces. Also it will help with kernel
	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
	if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
		return err
	}

	// Bind the rootfs onto itself so it becomes a mount point.
	return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
}
+
// setReadonly remounts the container's root (recursively) read-only via a
// bind remount.
func setReadonly() error {
	return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}
+
+func setupPtmx(config *configs.Config) error {
+       ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
+       if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
+               return err
+       }
+       if err := os.Symlink("pts/ptmx", ptmx); err != nil {
+               return fmt.Errorf("symlink dev ptmx %s", err)
+       }
+       return nil
+}
+
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
	// While the documentation may claim otherwise, pivot_root(".", ".") is
	// actually valid. What this results in is / being the new root but
	// /proc/self/cwd being the old root. Since we can play around with the cwd
	// with pivot_root this allows us to pivot without creating directories in
	// the rootfs. Shout-outs to the LXC developers for giving us this idea.

	// Hold fds to both the old and new roots so we can fchdir between them
	// regardless of what pivot_root does to the cwd.
	oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer unix.Close(oldroot)

	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer unix.Close(newroot)

	// Change to the new root so that the pivot_root actually acts on it.
	if err := unix.Fchdir(newroot); err != nil {
		return err
	}

	if err := unix.PivotRoot(".", "."); err != nil {
		return fmt.Errorf("pivot_root %s", err)
	}

	// Currently our "." is oldroot (according to the current kernel code).
	// However, purely for safety, we will fchdir(oldroot) since there isn't
	// really any guarantee from the kernel what /proc/self/cwd will be after a
	// pivot_root(2).

	if err := unix.Fchdir(oldroot); err != nil {
		return err
	}

	// Make oldroot rslave to make sure our unmounts don't propagate to the
	// host (and thus bork the machine). We don't use rprivate because this is
	// known to cause issues due to races where we still have a reference to a
	// mount while a process in the host namespace are trying to operate on
	// something they think has no mounts (devicemapper in particular).
	if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
		return err
	}
	// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
	if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
		return err
	}

	// Switch back to our shiny new root.
	if err := unix.Chdir("/"); err != nil {
		return fmt.Errorf("chdir / %s", err)
	}
	return nil
}
+
// msMoveRoot makes rootfs the root filesystem using MS_MOVE plus chroot
// instead of pivot_root(2). Before the move, every host proc/sysfs mount
// outside the container rootfs is detached — or, when unmounting is not
// permitted (e.g. rootless), covered with an empty tmpfs — so host
// information does not leak into the container.
func msMoveRoot(rootfs string) error {
	mountinfos, err := mount.GetMounts()
	if err != nil {
		return err
	}

	absRootfs, err := filepath.Abs(rootfs)
	if err != nil {
		return err
	}

	for _, info := range mountinfos {
		p, err := filepath.Abs(info.Mountpoint)
		if err != nil {
			return err
		}
		// Umount every syfs and proc file systems, except those under the container rootfs
		// NOTE(review): filepath.HasPrefix is a plain string-prefix test
		// (and deprecated), so a sibling mountpoint such as "/rootfs2"
		// would also be skipped for rootfs "/rootfs" — confirm callers
		// cannot hit this collision.
		if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) {
			continue
		}
		// Be sure umount events are not propagated to the host.
		if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
			return err
		}
		if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
			if err != unix.EINVAL && err != unix.EPERM {
				return err
			} else {
				// If we have not privileges for umounting (e.g. rootless), then
				// cover the path.
				if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
					return err
				}
			}
		}
	}
	// Move the rootfs mount over "/" and enter it.
	if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
		return err
	}
	return chroot(rootfs)
}
+
// chroot enters the new root. Note that the rootfs parameter is unused: the
// chroot target is the current working directory (".") — callers are
// presumably positioned inside the new root when this runs (see msMoveRoot);
// confirm at call sites.
func chroot(rootfs string) error {
	if err := unix.Chroot("."); err != nil {
		return err
	}
	return unix.Chdir("/")
}
+
+// createIfNotExists creates a file or a directory only if it does not already exist.
+func createIfNotExists(path string, isDir bool) error {
+       if _, err := os.Stat(path); err != nil {
+               if os.IsNotExist(err) {
+                       if isDir {
+                               return os.MkdirAll(path, 0755)
+                       }
+                       if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+                               return err
+                       }
+                       f, err := os.OpenFile(path, os.O_CREATE, 0755)
+                       if err != nil {
+                               return err
+                       }
+                       f.Close()
+               }
+       }
+       return nil
+}
+
+// readonlyPath will make a path read only.
+func readonlyPath(path string) error {
+       if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
+               if os.IsNotExist(err) {
+                       return nil
+               }
+               return err
+       }
+       return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
+}
+
+// remountReadonly will remount an existing mount point and ensure that it is read-only.
+func remountReadonly(m *configs.Mount) error {
+       var (
+               dest  = m.Destination
+               flags = m.Flags
+       )
+       for i := 0; i < 5; i++ {
+               // There is a special case in the kernel for
+               // MS_REMOUNT | MS_BIND, which allows us to change only the
+               // flags even as an unprivileged user (i.e. user namespace)
+               // assuming we don't drop any security related flags (nodev,
+               // nosuid, etc.). So, let's use that case so that we can do
+               // this re-mount without failing in a userns.
+               flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
+               if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil {
+                       switch err {
+                       case unix.EBUSY:
+                               time.Sleep(100 * time.Millisecond)
+                               continue
+                       default:
+                               return err
+                       }
+               }
+               return nil
+       }
+       return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
+}
+
+// maskPath masks the top of the specified path inside a container to avoid
+// security issues from processes reading information from non-namespace aware
+// mounts ( proc/kcore ).
+// For files, maskPath bind mounts /dev/null over the top of the specified path.
+// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
+func maskPath(path string, mountLabel string) error {
+       if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
+               if err == unix.ENOTDIR {
+                       return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
+               }
+               return err
+       }
+       return nil
+}
+
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
	relPath := strings.Replace(key, ".", "/", -1)
	target := path.Join("/proc/sys", relPath)
	return ioutil.WriteFile(target, []byte(value), 0644)
}
+
+func remount(m *configs.Mount, rootfs string) error {
+       var (
+               dest = m.Destination
+       )
+       if !strings.HasPrefix(dest, rootfs) {
+               dest = filepath.Join(rootfs, dest)
+       }
+       return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
+}
+
// Do the mount operation followed by additional mounts required to take care
// of propagation flags.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
	var (
		dest  = m.Destination
		data  = label.FormatMountLabel(m.Data, mountLabel)
		flags = m.Flags
	)
	// Never mount /dev itself read-only; later device setup needs to write
	// into it.
	if libcontainerUtils.CleanPath(dest) == "/dev" {
		flags &= ^unix.MS_RDONLY
	}

	// For tmpcopyup mounts the caller has already pointed m.Destination at
	// a scratch directory outside rootfs, so it must not be re-joined under
	// rootfs here.
	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
		dest = filepath.Join(rootfs, dest)
	}

	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
		return err
	}

	// Each propagation flag (e.g. shared/slave/private) is applied as its
	// own mount(2) call.
	for _, pflag := range m.PropagationFlags {
		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
			return err
		}
	}
	return nil
}
+
+func mountNewCgroup(m *configs.Mount) error {
+       var (
+               data   = m.Data
+               source = m.Source
+       )
+       if data == "systemd" {
+               data = cgroups.CgroupNamePrefix + data
+               source = "systemd"
+       }
+       if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
+               return err
+       }
+       return nil
+}
diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go
new file mode 100644 (file)
index 0000000..1bfe7c6
--- /dev/null
@@ -0,0 +1,101 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func TestCheckMountDestOnProc(t *testing.T) {
+       dest := "/rootfs/proc/sys"
+       err := checkProcMount("/rootfs", dest, "")
+       if err == nil {
+               t.Fatal("destination inside proc should return an error")
+       }
+}
+
+func TestCheckMountDestOnProcChroot(t *testing.T) {
+       dest := "/rootfs/proc/"
+       err := checkProcMount("/rootfs", dest, "/proc")
+       if err != nil {
+               t.Fatal("destination inside proc when using chroot should not return an error")
+       }
+}
+
+func TestCheckMountDestInSys(t *testing.T) {
+       dest := "/rootfs//sys/fs/cgroup"
+       err := checkProcMount("/rootfs", dest, "")
+       if err != nil {
+               t.Fatal("destination inside /sys should not return an error")
+       }
+}
+
+func TestCheckMountDestFalsePositive(t *testing.T) {
+       dest := "/rootfs/sysfiles/fs/cgroup"
+       err := checkProcMount("/rootfs", dest, "")
+       if err != nil {
+               t.Fatal(err)
+       }
+}
+
+func TestNeedsSetupDev(t *testing.T) {
+       config := &configs.Config{
+               Mounts: []*configs.Mount{
+                       {
+                               Device:      "bind",
+                               Source:      "/dev",
+                               Destination: "/dev",
+                       },
+               },
+       }
+       if needsSetupDev(config) {
+               t.Fatal("expected needsSetupDev to be false, got true")
+       }
+}
+
+func TestNeedsSetupDevStrangeSource(t *testing.T) {
+       config := &configs.Config{
+               Mounts: []*configs.Mount{
+                       {
+                               Device:      "bind",
+                               Source:      "/devx",
+                               Destination: "/dev",
+                       },
+               },
+       }
+       if needsSetupDev(config) {
+               t.Fatal("expected needsSetupDev to be false, got true")
+       }
+}
+
+func TestNeedsSetupDevStrangeDest(t *testing.T) {
+       config := &configs.Config{
+               Mounts: []*configs.Mount{
+                       {
+                               Device:      "bind",
+                               Source:      "/dev",
+                               Destination: "/devx",
+                       },
+               },
+       }
+       if !needsSetupDev(config) {
+               t.Fatal("expected needsSetupDev to be true, got false")
+       }
+}
+
+func TestNeedsSetupDevStrangeSourceDest(t *testing.T) {
+       config := &configs.Config{
+               Mounts: []*configs.Mount{
+                       {
+                               Device:      "bind",
+                               Source:      "/devx",
+                               Destination: "/devx",
+                       },
+               },
+       }
+       if !needsSetupDev(config) {
+               t.Fatal("expected needsSetupDev to be true, got false")
+       }
+}
diff --git a/libcontainer/seccomp/config.go b/libcontainer/seccomp/config.go
new file mode 100644 (file)
index 0000000..c321227
--- /dev/null
@@ -0,0 +1,77 @@
+package seccomp
+
+import (
+       "fmt"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// operators maps the SCMP_CMP_* comparison-operator names from Libseccomp's
// header to their libcontainer configs.Operator equivalents. Used by
// ConvertStringToOperator.
var operators = map[string]configs.Operator{
	"SCMP_CMP_NE":        configs.NotEqualTo,
	"SCMP_CMP_LT":        configs.LessThan,
	"SCMP_CMP_LE":        configs.LessThanOrEqualTo,
	"SCMP_CMP_EQ":        configs.EqualTo,
	"SCMP_CMP_GE":        configs.GreaterThanOrEqualTo,
	"SCMP_CMP_GT":        configs.GreaterThan,
	"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
+
// actions maps the SCMP_ACT_* rule-action names from Libseccomp's header to
// their libcontainer configs.Action equivalents. Used by
// ConvertStringToAction.
var actions = map[string]configs.Action{
	"SCMP_ACT_KILL":  configs.Kill,
	"SCMP_ACT_ERRNO": configs.Errno,
	"SCMP_ACT_TRAP":  configs.Trap,
	"SCMP_ACT_ALLOW": configs.Allow,
	"SCMP_ACT_TRACE": configs.Trace,
	"SCMP_ACT_LOG":   configs.Log,
}
+
// archs maps the SCMP_ARCH_* architecture names from Libseccomp's header to
// the architecture strings used in libcontainer seccomp configs. Used by
// ConvertStringToArch.
var archs = map[string]string{
	"SCMP_ARCH_X86":         "x86",
	"SCMP_ARCH_X86_64":      "amd64",
	"SCMP_ARCH_X32":         "x32",
	"SCMP_ARCH_ARM":         "arm",
	"SCMP_ARCH_AARCH64":     "arm64",
	"SCMP_ARCH_MIPS":        "mips",
	"SCMP_ARCH_MIPS64":      "mips64",
	"SCMP_ARCH_MIPS64N32":   "mips64n32",
	"SCMP_ARCH_MIPSEL":      "mipsel",
	"SCMP_ARCH_MIPSEL64":    "mipsel64",
	"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
	"SCMP_ARCH_PPC":         "ppc",
	"SCMP_ARCH_PPC64":       "ppc64",
	"SCMP_ARCH_PPC64LE":     "ppc64le",
	"SCMP_ARCH_S390":        "s390",
	"SCMP_ARCH_S390X":       "s390x",
}
+
+// ConvertStringToOperator converts a string into a Seccomp comparison operator.
+// Comparison operators use the names they are assigned by Libseccomp's header.
+// Attempting to convert a string that is not a valid operator results in an
+// error.
+func ConvertStringToOperator(in string) (configs.Operator, error) {
+       if op, ok := operators[in]; ok == true {
+               return op, nil
+       }
+       return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
+}
+
+// ConvertStringToAction converts a string into a Seccomp rule match action.
+// Actions use the names they are assigned in Libseccomp's header, though some
+// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
+// return errors.
+// Attempting to convert a string that is not a valid action results in an
+// error.
+func ConvertStringToAction(in string) (configs.Action, error) {
+       if act, ok := actions[in]; ok == true {
+               return act, nil
+       }
+       return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
+}
+
+// ConvertStringToArch converts a string into a Seccomp comparison arch.
+func ConvertStringToArch(in string) (string, error) {
+       if arch, ok := archs[in]; ok == true {
+               return arch, nil
+       }
+       return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
+}
diff --git a/libcontainer/seccomp/fixtures/proc_self_status b/libcontainer/seccomp/fixtures/proc_self_status
new file mode 100644 (file)
index 0000000..0e0084f
--- /dev/null
@@ -0,0 +1,47 @@
+Name:   cat
+State:  R (running)
+Tgid:   19383
+Ngid:   0
+Pid:    19383
+PPid:   19275
+TracerPid:  0
+Uid:    1000    1000    1000    1000
+Gid:    1000    1000    1000    1000
+FDSize: 256
+Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001
+NStgid: 19383
+NSpid:  19383
+NSpgid: 19383
+NSsid:  19275
+VmPeak:     5944 kB
+VmSize:     5944 kB
+VmLck:         0 kB
+VmPin:         0 kB
+VmHWM:       744 kB
+VmRSS:       744 kB
+VmData:      324 kB
+VmStk:       136 kB
+VmExe:        48 kB
+VmLib:      1776 kB
+VmPTE:        32 kB
+VmPMD:        12 kB
+VmSwap:        0 kB
+Threads:    1
+SigQ:   0/30067
+SigPnd: 0000000000000000
+ShdPnd: 0000000000000000
+SigBlk: 0000000000000000
+SigIgn: 0000000000000080
+SigCgt: 0000000000000000
+CapInh: 0000000000000000
+CapPrm: 0000000000000000
+CapEff: 0000000000000000
+CapBnd: 0000003fffffffff
+CapAmb: 0000000000000000
+Seccomp:    0
+Cpus_allowed:   f
+Cpus_allowed_list:  0-3
+Mems_allowed:   00000000,00000001
+Mems_allowed_list:  0
+voluntary_ctxt_switches:    0
+nonvoluntary_ctxt_switches: 1
diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go
new file mode 100644 (file)
index 0000000..1b7a071
--- /dev/null
@@ -0,0 +1,261 @@
+// +build linux,cgo,seccomp
+
+package seccomp
+
+import (
+       "bufio"
+       "fmt"
+       "os"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       libseccomp "github.com/seccomp/libseccomp-golang"
+
+       "golang.org/x/sys/unix"
+)
+
// Libseccomp actions used when translating configs.Action values.
// actTrace and actErrno are configured to make the syscall fail with EPERM
// (rather than the libseccomp defaults) via SetReturnCode.
var (
	actAllow = libseccomp.ActAllow
	actTrap  = libseccomp.ActTrap
	actKill  = libseccomp.ActKill
	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
	actLog   = libseccomp.ActLog
	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
+
const (
	// Linux system calls can have at most 6 arguments; used to size the
	// per-argument condition counters in matchCall.
	syscallMaxArguments int = 6
)
+
+// Filters given syscalls in a container, preventing them from being used
+// Started in the container init process, and carried over to all child processes
+// Setns calls, however, require a separate invocation, as they are not children
+// of the init until they join the namespace
+func InitSeccomp(config *configs.Seccomp) error {
+       if config == nil {
+               return fmt.Errorf("cannot initialize Seccomp - nil config passed")
+       }
+
+       defaultAction, err := getAction(config.DefaultAction)
+       if err != nil {
+               return fmt.Errorf("error initializing seccomp - invalid default action")
+       }
+
+       filter, err := libseccomp.NewFilter(defaultAction)
+       if err != nil {
+               return fmt.Errorf("error creating filter: %s", err)
+       }
+
+       // Add extra architectures
+       for _, arch := range config.Architectures {
+               scmpArch, err := libseccomp.GetArchFromString(arch)
+               if err != nil {
+                       return fmt.Errorf("error validating Seccomp architecture: %s", err)
+               }
+
+               if err := filter.AddArch(scmpArch); err != nil {
+                       return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
+               }
+       }
+
+       // Unset no new privs bit
+       if err := filter.SetNoNewPrivsBit(false); err != nil {
+               return fmt.Errorf("error setting no new privileges: %s", err)
+       }
+
+       // Add a rule for each syscall
+       for _, call := range config.Syscalls {
+               if call == nil {
+                       return fmt.Errorf("encountered nil syscall while initializing Seccomp")
+               }
+
+               if err = matchCall(filter, call); err != nil {
+                       return err
+               }
+       }
+
+       if err = filter.Load(); err != nil {
+               return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
+       }
+
+       return nil
+}
+
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
	// Try to read from /proc/self/status for kernels > 3.8
	s, err := parseStatusFile("/proc/self/status")
	if err != nil {
		// /proc is unavailable or unreadable; fall back to probing with
		// prctl(2). EINVAL from these probes indicates missing kernel
		// support.
		// Check if Seccomp is supported, via CONFIG_SECCOMP.
		if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
			// Make sure the kernel has CONFIG_SECCOMP_FILTER.
			if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
				return true
			}
		}
		return false
	}
	// The "Seccomp" status field is only present on kernels with seccomp
	// support, so its presence alone answers the question.
	_, ok := s["Seccomp"]
	return ok
}
+
+// Convert Libcontainer Action to Libseccomp ScmpAction
+func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
+       switch act {
+       case configs.Kill:
+               return actKill, nil
+       case configs.Errno:
+               return actErrno, nil
+       case configs.Trap:
+               return actTrap, nil
+       case configs.Allow:
+               return actAllow, nil
+       case configs.Trace:
+               return actTrace, nil
+       case configs.Log:
+               return actLog, nil
+       default:
+               return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
+       }
+}
+
+// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
+func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
+       switch op {
+       case configs.EqualTo:
+               return libseccomp.CompareEqual, nil
+       case configs.NotEqualTo:
+               return libseccomp.CompareNotEqual, nil
+       case configs.GreaterThan:
+               return libseccomp.CompareGreater, nil
+       case configs.GreaterThanOrEqualTo:
+               return libseccomp.CompareGreaterEqual, nil
+       case configs.LessThan:
+               return libseccomp.CompareLess, nil
+       case configs.LessThanOrEqualTo:
+               return libseccomp.CompareLessOrEqual, nil
+       case configs.MaskEqualTo:
+               return libseccomp.CompareMaskedEqual, nil
+       default:
+               return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
+       }
+}
+
+// Convert Libcontainer Arg to Libseccomp ScmpCondition
+func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
+       cond := libseccomp.ScmpCondition{}
+
+       if arg == nil {
+               return cond, fmt.Errorf("cannot convert nil to syscall condition")
+       }
+
+       op, err := getOperator(arg.Op)
+       if err != nil {
+               return cond, err
+       }
+
+       return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
+}
+
+// Add a rule to match a single syscall
+func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
+       if call == nil || filter == nil {
+               return fmt.Errorf("cannot use nil as syscall to block")
+       }
+
+       if len(call.Name) == 0 {
+               return fmt.Errorf("empty string is not a valid syscall")
+       }
+
+       // If we can't resolve the syscall, assume it's not supported on this kernel
+       // Ignore it, don't error out
+       callNum, err := libseccomp.GetSyscallFromName(call.Name)
+       if err != nil {
+               return nil
+       }
+
+       // Convert the call's action to the libseccomp equivalent
+       callAct, err := getAction(call.Action)
+       if err != nil {
+               return fmt.Errorf("action in seccomp profile is invalid: %s", err)
+       }
+
+       // Unconditional match - just add the rule
+       if len(call.Args) == 0 {
+               if err = filter.AddRule(callNum, callAct); err != nil {
+                       return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
+               }
+       } else {
+               // If two or more arguments have the same condition,
+               // Revert to old behavior, adding each condition as a separate rule
+               argCounts := make([]uint, syscallMaxArguments)
+               conditions := []libseccomp.ScmpCondition{}
+
+               for _, cond := range call.Args {
+                       newCond, err := getCondition(cond)
+                       if err != nil {
+                               return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
+                       }
+
+                       argCounts[cond.Index] += 1
+
+                       conditions = append(conditions, newCond)
+               }
+
+               hasMultipleArgs := false
+               for _, count := range argCounts {
+                       if count > 1 {
+                               hasMultipleArgs = true
+                               break
+                       }
+               }
+
+               if hasMultipleArgs {
+                       // Revert to old behavior
+                       // Add each condition attached to a separate rule
+                       for _, cond := range conditions {
+                               condArr := []libseccomp.ScmpCondition{cond}
+
+                               if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
+                                       return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
+                               }
+                       }
+               } else {
+                       // No conditions share same argument
+                       // Use new, proper behavior
+                       if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
+                               return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
+                       }
+               }
+       }
+
+       return nil
+}
+
+func parseStatusFile(path string) (map[string]string, error) {
+       f, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       s := bufio.NewScanner(f)
+       status := make(map[string]string)
+
+       for s.Scan() {
+               text := s.Text()
+               parts := strings.Split(text, ":")
+
+               if len(parts) <= 1 {
+                       continue
+               }
+
+               status[parts[0]] = parts[1]
+       }
+       if err := s.Err(); err != nil {
+               return nil, err
+       }
+
+       return status, nil
+}
diff --git a/libcontainer/seccomp/seccomp_linux_test.go b/libcontainer/seccomp/seccomp_linux_test.go
new file mode 100644 (file)
index 0000000..67a2ef6
--- /dev/null
@@ -0,0 +1,17 @@
+// +build linux,cgo,seccomp
+
+package seccomp
+
+import "testing"
+
+func TestParseStatusFile(t *testing.T) {
+       s, err := parseStatusFile("fixtures/proc_self_status")
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       if _, ok := s["Seccomp"]; !ok {
+
+               t.Fatal("expected to find 'Seccomp' in the map but did not.")
+       }
+}
diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go
new file mode 100644 (file)
index 0000000..44df1ad
--- /dev/null
@@ -0,0 +1,24 @@
+// +build !linux !cgo !seccomp
+
+package seccomp
+
+import (
+       "errors"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// ErrSeccompNotEnabled is returned by InitSeccomp when a seccomp config is
// provided but this build has no seccomp support (non-Linux, no cgo, or the
// "seccomp" build tag is absent).
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
+
+// InitSeccomp does nothing because seccomp is not supported.
+func InitSeccomp(config *configs.Seccomp) error {
+       if config != nil {
+               return ErrSeccompNotEnabled
+       }
+       return nil
+}
+
// IsEnabled reports whether seccomp is available; it is always false in
// builds compiled without seccomp support.
func IsEnabled() bool {
	const supported = false
	return supported
}
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
new file mode 100644 (file)
index 0000000..888981f
--- /dev/null
@@ -0,0 +1,92 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "os"
+       "runtime"
+
+       "github.com/opencontainers/runc/libcontainer/apparmor"
+       "github.com/opencontainers/runc/libcontainer/keys"
+       "github.com/opencontainers/runc/libcontainer/seccomp"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/selinux/go-selinux/label"
+       "github.com/pkg/errors"
+
+       "golang.org/x/sys/unix"
+)
+
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
	pipe          *os.File // sync/config pipe to the parent process
	consoleSocket *os.File // socket used to send the console fd to the caller
	config        *initConfig
}
+
// getSessionRingName returns the name used for this container's session
// keyring ("_ses.<ContainerId>").
func (l *linuxSetnsInit) getSessionRingName() string {
	return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
+
// Init prepares the joining process (keyring, console, no_new_privs,
// SELinux/AppArmor labels, seccomp) and then execs the requested command.
// The ordering of these steps matters; see the inline comments.
func (l *linuxSetnsInit) Init() error {
	// The OS thread is locked for the duration of setup; several of the
	// operations below (labels, keyring) act on the calling thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !l.config.Config.NoNewKeyring {
		if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer label.SetKeyLabel("")
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
			}
		}
	}
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	defer label.SetProcessLabel("")
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	// Replace the current process image with the requested command.
	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
new file mode 100644 (file)
index 0000000..23e225c
--- /dev/null
@@ -0,0 +1,223 @@
+package specconv
+
+import (
+       "os"
+       "strings"
+
+       "github.com/opencontainers/runtime-spec/specs-go"
+)
+
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
	return &specs.Spec{
		Version: specs.Version,
		Root: &specs.Root{
			Path:     "rootfs",
			Readonly: true,
		},
		// An interactive shell with a restricted capability set and a
		// single RLIMIT_NOFILE rlimit.
		Process: &specs.Process{
			Terminal: true,
			User:     specs.User{},
			Args: []string{
				"sh",
			},
			Env: []string{
				"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
				"TERM=xterm",
			},
			Cwd:             "/",
			NoNewPrivileges: true,
			Capabilities: &specs.LinuxCapabilities{
				Bounding: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Permitted: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Inheritable: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Ambient: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Effective: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
			},
			Rlimits: []specs.POSIXRlimit{
				{
					Type: "RLIMIT_NOFILE",
					Hard: uint64(1024),
					Soft: uint64(1024),
				},
			},
		},
		Hostname: "runc",
		// The standard pseudo-filesystem mounts: proc, dev (tmpfs), devpts,
		// shm, mqueue, sysfs and the cgroup tree (read-only).
		Mounts: []specs.Mount{
			{
				Destination: "/proc",
				Type:        "proc",
				Source:      "proc",
				Options:     nil,
			},
			{
				Destination: "/dev",
				Type:        "tmpfs",
				Source:      "tmpfs",
				Options:     []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
			},
			{
				Destination: "/dev/pts",
				Type:        "devpts",
				Source:      "devpts",
				Options:     []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
			},
			{
				Destination: "/dev/shm",
				Type:        "tmpfs",
				Source:      "shm",
				Options:     []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
			},
			{
				Destination: "/dev/mqueue",
				Type:        "mqueue",
				Source:      "mqueue",
				Options:     []string{"nosuid", "noexec", "nodev"},
			},
			{
				Destination: "/sys",
				Type:        "sysfs",
				Source:      "sysfs",
				Options:     []string{"nosuid", "noexec", "nodev", "ro"},
			},
			{
				Destination: "/sys/fs/cgroup",
				Type:        "cgroup",
				Source:      "cgroup",
				Options:     []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
			},
		},
		Linux: &specs.Linux{
			// Paths under /proc and /sys that are hidden or made read-only
			// inside the container.
			MaskedPaths: []string{
				"/proc/acpi",
				"/proc/asound",
				"/proc/kcore",
				"/proc/keys",
				"/proc/latency_stats",
				"/proc/timer_list",
				"/proc/timer_stats",
				"/proc/sched_debug",
				"/sys/firmware",
				"/proc/scsi",
			},
			ReadonlyPaths: []string{
				"/proc/bus",
				"/proc/fs",
				"/proc/irq",
				"/proc/sys",
				"/proc/sysrq-trigger",
			},
			// Deny all device access by default (Allow: false, "rwm").
			Resources: &specs.LinuxResources{
				Devices: []specs.LinuxDeviceCgroup{
					{
						Allow:  false,
						Access: "rwm",
					},
				},
			},
			Namespaces: []specs.LinuxNamespace{
				{
					Type: "pid",
				},
				{
					Type: "network",
				},
				{
					Type: "ipc",
				},
				{
					Type: "uts",
				},
				{
					Type: "mount",
				},
			},
		},
	}
}
+
+// ToRootless converts the given spec file into one that should work with
+// rootless containers (euid != 0), by removing incompatible options and adding others that
+// are needed.
+func ToRootless(spec *specs.Spec) {
+       var namespaces []specs.LinuxNamespace
+
+       // Remove networkns from the spec.
+       for _, ns := range spec.Linux.Namespaces {
+               switch ns.Type {
+               case specs.NetworkNamespace, specs.UserNamespace:
+                       // Do nothing.
+               default:
+                       namespaces = append(namespaces, ns)
+               }
+       }
+       // Add userns to the spec.
+       namespaces = append(namespaces, specs.LinuxNamespace{
+               Type: specs.UserNamespace,
+       })
+       spec.Linux.Namespaces = namespaces
+
+       // Add mappings for the current user.
+       spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
+               HostID:      uint32(os.Geteuid()),
+               ContainerID: 0,
+               Size:        1,
+       }}
+       spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
+               HostID:      uint32(os.Getegid()),
+               ContainerID: 0,
+               Size:        1,
+       }}
+
+       // Fix up mounts.
+       var mounts []specs.Mount
+       for _, mount := range spec.Mounts {
+               // Ignore all mounts that are under /sys.
+               if strings.HasPrefix(mount.Destination, "/sys") {
+                       continue
+               }
+
+               // Remove all gid= and uid= mappings.
+               var options []string
+               for _, option := range mount.Options {
+                       if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
+                               options = append(options, option)
+                       }
+               }
+
+               mount.Options = options
+               mounts = append(mounts, mount)
+       }
+       // Add the sysfs mount as an rbind.
+       mounts = append(mounts, specs.Mount{
+               Source:      "/sys",
+               Destination: "/sys",
+               Type:        "none",
+               Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
+       })
+       spec.Mounts = mounts
+
+       // Remove cgroup settings.
+       spec.Linux.Resources = nil
+}
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
new file mode 100644 (file)
index 0000000..d9e73c4
--- /dev/null
@@ -0,0 +1,839 @@
+// +build linux
+
+// Package specconv implements conversion of specifications to libcontainer
+// configurations
+package specconv
+
+import (
+       "fmt"
+       "os"
+       "path/filepath"
+       "strings"
+       "time"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/seccomp"
+       libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "golang.org/x/sys/unix"
+)
+
+const wildcard = -1
+
+var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
+       specs.PIDNamespace:     configs.NEWPID,
+       specs.NetworkNamespace: configs.NEWNET,
+       specs.MountNamespace:   configs.NEWNS,
+       specs.UserNamespace:    configs.NEWUSER,
+       specs.IPCNamespace:     configs.NEWIPC,
+       specs.UTSNamespace:     configs.NEWUTS,
+       specs.CgroupNamespace:  configs.NEWCGROUP,
+}
+
+var mountPropagationMapping = map[string]int{
+       "rprivate":    unix.MS_PRIVATE | unix.MS_REC,
+       "private":     unix.MS_PRIVATE,
+       "rslave":      unix.MS_SLAVE | unix.MS_REC,
+       "slave":       unix.MS_SLAVE,
+       "rshared":     unix.MS_SHARED | unix.MS_REC,
+       "shared":      unix.MS_SHARED,
+       "runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
+       "unbindable":  unix.MS_UNBINDABLE,
+       "":            0,
+}
+
+// AllowedDevices is exposed for devicefilter_test.go
+var AllowedDevices = []*configs.Device{
+       // allow mknod for any device
+       {
+               Type:        'c',
+               Major:       wildcard,
+               Minor:       wildcard,
+               Permissions: "m",
+               Allow:       true,
+       },
+       {
+               Type:        'b',
+               Major:       wildcard,
+               Minor:       wildcard,
+               Permissions: "m",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/null",
+               Major:       1,
+               Minor:       3,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/random",
+               Major:       1,
+               Minor:       8,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/full",
+               Major:       1,
+               Minor:       7,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/tty",
+               Major:       5,
+               Minor:       0,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/zero",
+               Major:       1,
+               Minor:       5,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Type:        'c',
+               Path:        "/dev/urandom",
+               Major:       1,
+               Minor:       9,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Path:        "/dev/console",
+               Type:        'c',
+               Major:       5,
+               Minor:       1,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       // /dev/pts/ - pts namespaces are "coming soon"
+       {
+               Path:        "",
+               Type:        'c',
+               Major:       136,
+               Minor:       wildcard,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       {
+               Path:        "",
+               Type:        'c',
+               Major:       5,
+               Minor:       2,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+       // tuntap
+       {
+               Path:        "",
+               Type:        'c',
+               Major:       10,
+               Minor:       200,
+               Permissions: "rwm",
+               Allow:       true,
+       },
+}
+
// CreateOpts bundles the inputs needed to convert an OCI runtime spec into
// a libcontainer configuration.
type CreateOpts struct {
	CgroupName       string // name used for the container's cgroup (or systemd scope)
	UseSystemdCgroup bool   // interpret cgroupsPath as systemd "slice:prefix:name"
	NoPivotRoot      bool   // skip pivot_root(2) when entering the rootfs
	NoNewKeyring     bool   // do not create a new session keyring for the container
	Spec             *specs.Spec // the OCI runtime specification to convert
	RootlessEUID     bool   // running with a non-zero effective UID
	RootlessCgroups  bool   // cgroup changes are likely to fail and are ignored
}
+
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name.
//
// It resolves the rootfs path relative to the bundle (the current working
// directory), converts mounts, devices, cgroups, namespaces, seccomp,
// Intel RDT and process settings, and wires up lifecycle hooks. It returns
// an error if the spec has no root section, uses an unknown namespace or
// propagation mode, duplicates a namespace, or contains an invalid
// device/seccomp entry.
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
	// runc's cwd will always be the bundle path
	rcwd, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	cwd, err := filepath.Abs(rcwd)
	if err != nil {
		return nil, err
	}
	spec := opts.Spec
	if spec.Root == nil {
		return nil, fmt.Errorf("Root must be specified")
	}
	rootfsPath := spec.Root.Path
	// A relative rootfs path is interpreted relative to the bundle.
	if !filepath.IsAbs(rootfsPath) {
		rootfsPath = filepath.Join(cwd, rootfsPath)
	}
	// Spec annotations become "key=value" labels; a synthetic
	// "bundle=<path>" label records the bundle directory.
	labels := []string{}
	for k, v := range spec.Annotations {
		labels = append(labels, fmt.Sprintf("%s=%s", k, v))
	}
	config := &configs.Config{
		Rootfs:          rootfsPath,
		NoPivotRoot:     opts.NoPivotRoot,
		Readonlyfs:      spec.Root.Readonly,
		Hostname:        spec.Hostname,
		Labels:          append(labels, fmt.Sprintf("bundle=%s", cwd)),
		NoNewKeyring:    opts.NoNewKeyring,
		RootlessEUID:    opts.RootlessEUID,
		RootlessCgroups: opts.RootlessCgroups,
	}

	exists := false
	for _, m := range spec.Mounts {
		config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
	}
	if err := createDevices(spec, config); err != nil {
		return nil, err
	}
	c, err := CreateCgroupConfig(opts)
	if err != nil {
		return nil, err
	}
	config.Cgroups = c
	// set linux-specific config
	if spec.Linux != nil {
		if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
			return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
		}
		// Without pivot_root the old rootfs stays visible; [r]private
		// propagation would make that state impossible to clean up safely.
		if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
			return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
		}

		for _, ns := range spec.Linux.Namespaces {
			t, exists := namespaceMapping[ns.Type]
			if !exists {
				return nil, fmt.Errorf("namespace %q does not exist", ns)
			}
			if config.Namespaces.Contains(t) {
				return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
			}
			config.Namespaces.Add(t, ns.Path)
		}
		// A new (pathless) network namespace gets a loopback interface.
		if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
			config.Networks = []*configs.Network{
				{
					Type: "loopback",
				},
			}
		}
		if config.Namespaces.Contains(configs.NEWUSER) {
			if err := setupUserNamespace(spec, config); err != nil {
				return nil, err
			}
		}
		config.MaskPaths = spec.Linux.MaskedPaths
		config.ReadonlyPaths = spec.Linux.ReadonlyPaths
		config.MountLabel = spec.Linux.MountLabel
		config.Sysctl = spec.Linux.Sysctl
		if spec.Linux.Seccomp != nil {
			seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
			if err != nil {
				return nil, err
			}
			config.Seccomp = seccomp
		}
		if spec.Linux.IntelRdt != nil {
			config.IntelRdt = &configs.IntelRdt{}
			if spec.Linux.IntelRdt.L3CacheSchema != "" {
				config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
			}
			if spec.Linux.IntelRdt.MemBwSchema != "" {
				config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
			}
		}
	}
	if spec.Process != nil {
		config.OomScoreAdj = spec.Process.OOMScoreAdj
		if spec.Process.SelinuxLabel != "" {
			config.ProcessLabel = spec.Process.SelinuxLabel
		}
		if spec.Process.Capabilities != nil {
			config.Capabilities = &configs.Capabilities{
				Bounding:    spec.Process.Capabilities.Bounding,
				Effective:   spec.Process.Capabilities.Effective,
				Permitted:   spec.Process.Capabilities.Permitted,
				Inheritable: spec.Process.Capabilities.Inheritable,
				Ambient:     spec.Process.Capabilities.Ambient,
			}
		}
	}
	createHooks(spec, config)
	config.Version = specs.Version
	return config, nil
}
+
+func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
+       flags, pgflags, data, ext := parseMountOptions(m.Options)
+       source := m.Source
+       device := m.Type
+       if flags&unix.MS_BIND != 0 {
+               // Any "type" the user specified is meaningless (and ignored) for
+               // bind-mounts -- so we set it to "bind" because rootfs_linux.go
+               // (incorrectly) relies on this for some checks.
+               device = "bind"
+               if !filepath.IsAbs(source) {
+                       source = filepath.Join(cwd, m.Source)
+               }
+       }
+       return &configs.Mount{
+               Device:           device,
+               Source:           source,
+               Destination:      m.Destination,
+               Data:             data,
+               Flags:            flags,
+               PropagationFlags: pgflags,
+               Extensions:       ext,
+       }
+}
+
// CreateCgroupConfig builds the cgroup configuration for a container from
// the spec's linux.cgroupsPath and linux.resources sections.
//
// With systemd cgroups, cgroupsPath must be of the form
// "slice:prefix:name"; otherwise the (cleaned) path is used directly, and
// an empty path falls back to opts.CgroupName. The default AllowedDevices
// whitelist is always attached, and appended after any spec-provided
// device rules.
func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
	var (
		myCgroupPath string

		spec             = opts.Spec
		useSystemdCgroup = opts.UseSystemdCgroup
		name             = opts.CgroupName
	)

	c := &configs.Cgroup{
		Resources: &configs.Resources{},
	}

	if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
		myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
		// systemd paths are colon-separated triples, not filesystem
		// paths, so they must not be cleaned.
		if useSystemdCgroup {
			myCgroupPath = spec.Linux.CgroupsPath
		}
	}

	if useSystemdCgroup {
		if myCgroupPath == "" {
			c.Parent = "system.slice"
			c.ScopePrefix = "runc"
			c.Name = name
		} else {
			// Parse the path from expected "slice:prefix:name"
			// for e.g. "system.slice:docker:1234"
			parts := strings.Split(myCgroupPath, ":")
			if len(parts) != 3 {
				return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath)
			}
			c.Parent = parts[0]
			c.ScopePrefix = parts[1]
			c.Name = parts[2]
		}
	} else {
		if myCgroupPath == "" {
			c.Name = name
		}
		c.Path = myCgroupPath
	}

	// In rootless containers, any attempt to make cgroup changes is likely to fail.
	// libcontainer will validate this but ignores the error.
	c.Resources.AllowedDevices = AllowedDevices
	if spec.Linux != nil {
		r := spec.Linux.Resources
		if r == nil {
			return c, nil
		}
		// Convert spec device rules; missing type defaults to "a" (all),
		// missing major/minor default to -1 (wildcard).
		for i, d := range spec.Linux.Resources.Devices {
			var (
				t     = "a"
				major = int64(-1)
				minor = int64(-1)
			)
			if d.Type != "" {
				t = d.Type
			}
			if d.Major != nil {
				major = *d.Major
			}
			if d.Minor != nil {
				minor = *d.Minor
			}
			if d.Access == "" {
				return nil, fmt.Errorf("device access at %d field cannot be empty", i)
			}
			dt, err := stringToCgroupDeviceRune(t)
			if err != nil {
				return nil, err
			}
			dd := &configs.Device{
				Type:        dt,
				Major:       major,
				Minor:       minor,
				Permissions: d.Access,
				Allow:       d.Allow,
			}
			c.Resources.Devices = append(c.Resources.Devices, dd)
		}
		// Memory limits: each field is only set when present in the spec.
		if r.Memory != nil {
			if r.Memory.Limit != nil {
				c.Resources.Memory = *r.Memory.Limit
			}
			if r.Memory.Reservation != nil {
				c.Resources.MemoryReservation = *r.Memory.Reservation
			}
			if r.Memory.Swap != nil {
				c.Resources.MemorySwap = *r.Memory.Swap
			}
			if r.Memory.Kernel != nil {
				c.Resources.KernelMemory = *r.Memory.Kernel
			}
			if r.Memory.KernelTCP != nil {
				c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
			}
			if r.Memory.Swappiness != nil {
				c.Resources.MemorySwappiness = r.Memory.Swappiness
			}
			if r.Memory.DisableOOMKiller != nil {
				c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
			}
		}
		// CPU shares/quota/period, realtime budget and cpuset placement.
		if r.CPU != nil {
			if r.CPU.Shares != nil {
				c.Resources.CpuShares = *r.CPU.Shares
			}
			if r.CPU.Quota != nil {
				c.Resources.CpuQuota = *r.CPU.Quota
			}
			if r.CPU.Period != nil {
				c.Resources.CpuPeriod = *r.CPU.Period
			}
			if r.CPU.RealtimeRuntime != nil {
				c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
			}
			if r.CPU.RealtimePeriod != nil {
				c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
			}
			if r.CPU.Cpus != "" {
				c.Resources.CpusetCpus = r.CPU.Cpus
			}
			if r.CPU.Mems != "" {
				c.Resources.CpusetMems = r.CPU.Mems
			}
		}
		if r.Pids != nil {
			c.Resources.PidsLimit = r.Pids.Limit
		}
		// Block I/O weights and per-device throttle rates.
		if r.BlockIO != nil {
			if r.BlockIO.Weight != nil {
				c.Resources.BlkioWeight = *r.BlockIO.Weight
			}
			if r.BlockIO.LeafWeight != nil {
				c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
			}
			if r.BlockIO.WeightDevice != nil {
				for _, wd := range r.BlockIO.WeightDevice {
					var weight, leafWeight uint16
					if wd.Weight != nil {
						weight = *wd.Weight
					}
					if wd.LeafWeight != nil {
						leafWeight = *wd.LeafWeight
					}
					weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
					c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
				}
			}
			if r.BlockIO.ThrottleReadBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleReadIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
				}
			}
		}
		for _, l := range r.HugepageLimits {
			c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
				Pagesize: l.Pagesize,
				Limit:    l.Limit,
			})
		}
		// Network classid and per-interface priorities.
		if r.Network != nil {
			if r.Network.ClassID != nil {
				c.Resources.NetClsClassid = *r.Network.ClassID
			}
			for _, m := range r.Network.Priorities {
				c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
					Interface: m.Name,
					Priority:  int64(m.Priority),
				})
			}
		}
	}
	// append the default allowed devices to the end of the list
	c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...)
	return c, nil
}
+
// stringToCgroupDeviceRune maps a spec cgroup device type string ("a", "b"
// or "c") onto the single-rune form used by libcontainer, rejecting
// anything else.
func stringToCgroupDeviceRune(s string) (rune, error) {
	if s == "a" || s == "b" || s == "c" {
		return rune(s[0]), nil
	}
	return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
+
// stringToDeviceRune maps a spec device node type string ("p", "u", "b" or
// "c") onto the single-rune form used by libcontainer, rejecting anything
// else.
func stringToDeviceRune(s string) (rune, error) {
	switch s {
	case "p", "u", "b", "c":
		return rune(s[0]), nil
	}
	return 0, fmt.Errorf("invalid device type %q", s)
}
+
+func createDevices(spec *specs.Spec, config *configs.Config) error {
+       // add whitelisted devices
+       config.Devices = []*configs.Device{
+               {
+                       Type:     'c',
+                       Path:     "/dev/null",
+                       Major:    1,
+                       Minor:    3,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+               {
+                       Type:     'c',
+                       Path:     "/dev/random",
+                       Major:    1,
+                       Minor:    8,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+               {
+                       Type:     'c',
+                       Path:     "/dev/full",
+                       Major:    1,
+                       Minor:    7,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+               {
+                       Type:     'c',
+                       Path:     "/dev/tty",
+                       Major:    5,
+                       Minor:    0,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+               {
+                       Type:     'c',
+                       Path:     "/dev/zero",
+                       Major:    1,
+                       Minor:    5,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+               {
+                       Type:     'c',
+                       Path:     "/dev/urandom",
+                       Major:    1,
+                       Minor:    9,
+                       FileMode: 0666,
+                       Uid:      0,
+                       Gid:      0,
+               },
+       }
+       // merge in additional devices from the spec
+       if spec.Linux != nil {
+               for _, d := range spec.Linux.Devices {
+                       var uid, gid uint32
+                       var filemode os.FileMode = 0666
+
+                       if d.UID != nil {
+                               uid = *d.UID
+                       }
+                       if d.GID != nil {
+                               gid = *d.GID
+                       }
+                       dt, err := stringToDeviceRune(d.Type)
+                       if err != nil {
+                               return err
+                       }
+                       if d.FileMode != nil {
+                               filemode = *d.FileMode
+                       }
+                       device := &configs.Device{
+                               Type:     dt,
+                               Path:     d.Path,
+                               Major:    d.Major,
+                               Minor:    d.Minor,
+                               FileMode: filemode,
+                               Uid:      uid,
+                               Gid:      gid,
+                       }
+                       config.Devices = append(config.Devices, device)
+               }
+       }
+       return nil
+}
+
+func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
+       create := func(m specs.LinuxIDMapping) configs.IDMap {
+               return configs.IDMap{
+                       HostID:      int(m.HostID),
+                       ContainerID: int(m.ContainerID),
+                       Size:        int(m.Size),
+               }
+       }
+       if spec.Linux != nil {
+               for _, m := range spec.Linux.UIDMappings {
+                       config.UidMappings = append(config.UidMappings, create(m))
+               }
+               for _, m := range spec.Linux.GIDMappings {
+                       config.GidMappings = append(config.GidMappings, create(m))
+               }
+       }
+       rootUID, err := config.HostRootUID()
+       if err != nil {
+               return err
+       }
+       rootGID, err := config.HostRootGID()
+       if err != nil {
+               return err
+       }
+       for _, node := range config.Devices {
+               node.Uid = uint32(rootUID)
+               node.Gid = uint32(rootGID)
+       }
+       return nil
+}
+
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
//
// The four return values are: the mount(2) flag bits, the list of
// propagation flags (applied as separate remounts), the comma-joined
// filesystem-specific data string, and runc-internal extension flag bits.
func parseMountOptions(options []string) (int, []int, string, int) {
	var (
		flag     int
		pgflag   []int
		data     []string
		extFlags int
	)
	// Mount options that map to MS_* flag bits. "clear" entries (e.g.
	// "rw", "exec") remove their bit instead of setting it.
	flags := map[string]struct {
		clear bool
		flag  int
	}{
		"acl":           {false, unix.MS_POSIXACL},
		"async":         {true, unix.MS_SYNCHRONOUS},
		"atime":         {true, unix.MS_NOATIME},
		"bind":          {false, unix.MS_BIND},
		"defaults":      {false, 0},
		"dev":           {true, unix.MS_NODEV},
		"diratime":      {true, unix.MS_NODIRATIME},
		"dirsync":       {false, unix.MS_DIRSYNC},
		"exec":          {true, unix.MS_NOEXEC},
		"iversion":      {false, unix.MS_I_VERSION},
		"lazytime":      {false, unix.MS_LAZYTIME},
		"loud":          {true, unix.MS_SILENT},
		"mand":          {false, unix.MS_MANDLOCK},
		"noacl":         {true, unix.MS_POSIXACL},
		"noatime":       {false, unix.MS_NOATIME},
		"nodev":         {false, unix.MS_NODEV},
		"nodiratime":    {false, unix.MS_NODIRATIME},
		"noexec":        {false, unix.MS_NOEXEC},
		"noiversion":    {true, unix.MS_I_VERSION},
		"nolazytime":    {true, unix.MS_LAZYTIME},
		"nomand":        {true, unix.MS_MANDLOCK},
		"norelatime":    {true, unix.MS_RELATIME},
		"nostrictatime": {true, unix.MS_STRICTATIME},
		"nosuid":        {false, unix.MS_NOSUID},
		"rbind":         {false, unix.MS_BIND | unix.MS_REC},
		"relatime":      {false, unix.MS_RELATIME},
		"remount":       {false, unix.MS_REMOUNT},
		"ro":            {false, unix.MS_RDONLY},
		"rw":            {true, unix.MS_RDONLY},
		"silent":        {false, unix.MS_SILENT},
		"strictatime":   {false, unix.MS_STRICTATIME},
		"suid":          {true, unix.MS_NOSUID},
		"sync":          {false, unix.MS_SYNCHRONOUS},
	}
	// Propagation options are collected separately since each needs its
	// own mount(2) call.
	propagationFlags := map[string]int{
		"private":     unix.MS_PRIVATE,
		"shared":      unix.MS_SHARED,
		"slave":       unix.MS_SLAVE,
		"unbindable":  unix.MS_UNBINDABLE,
		"rprivate":    unix.MS_PRIVATE | unix.MS_REC,
		"rshared":     unix.MS_SHARED | unix.MS_REC,
		"rslave":      unix.MS_SLAVE | unix.MS_REC,
		"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
	}
	// runc-specific extension options that are not kernel mount flags.
	extensionFlags := map[string]struct {
		clear bool
		flag  int
	}{
		"tmpcopyup": {false, configs.EXT_COPYUP},
	}
	for _, o := range options {
		// If the option does not exist in the flags table or the flag
		// is not supported on the platform,
		// then it is a data value for a specific fs type
		if f, exists := flags[o]; exists && f.flag != 0 {
			if f.clear {
				flag &= ^f.flag
			} else {
				flag |= f.flag
			}
		} else if f, exists := propagationFlags[o]; exists && f != 0 {
			pgflag = append(pgflag, f)
		} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
			if f.clear {
				extFlags &= ^f.flag
			} else {
				extFlags |= f.flag
			}
		} else {
			data = append(data, o)
		}
	}
	return flag, pgflag, strings.Join(data, ","), extFlags
}
+
+func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
+       if config == nil {
+               return nil, nil
+       }
+
+       // No default action specified, no syscalls listed, assume seccomp disabled
+       if config.DefaultAction == "" && len(config.Syscalls) == 0 {
+               return nil, nil
+       }
+
+       newConfig := new(configs.Seccomp)
+       newConfig.Syscalls = []*configs.Syscall{}
+
+       if len(config.Architectures) > 0 {
+               newConfig.Architectures = []string{}
+               for _, arch := range config.Architectures {
+                       newArch, err := seccomp.ConvertStringToArch(string(arch))
+                       if err != nil {
+                               return nil, err
+                       }
+                       newConfig.Architectures = append(newConfig.Architectures, newArch)
+               }
+       }
+
+       // Convert default action from string representation
+       newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
+       if err != nil {
+               return nil, err
+       }
+       newConfig.DefaultAction = newDefaultAction
+
+       // Loop through all syscall blocks and convert them to libcontainer format
+       for _, call := range config.Syscalls {
+               newAction, err := seccomp.ConvertStringToAction(string(call.Action))
+               if err != nil {
+                       return nil, err
+               }
+
+               for _, name := range call.Names {
+                       newCall := configs.Syscall{
+                               Name:   name,
+                               Action: newAction,
+                               Args:   []*configs.Arg{},
+                       }
+                       // Loop through all the arguments of the syscall and convert them
+                       for _, arg := range call.Args {
+                               newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
+                               if err != nil {
+                                       return nil, err
+                               }
+
+                               newArg := configs.Arg{
+                                       Index:    arg.Index,
+                                       Value:    arg.Value,
+                                       ValueTwo: arg.ValueTwo,
+                                       Op:       newOp,
+                               }
+
+                               newCall.Args = append(newCall.Args, &newArg)
+                       }
+                       newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
+               }
+       }
+
+       return newConfig, nil
+}
+
+func createHooks(rspec *specs.Spec, config *configs.Config) {
+       config.Hooks = &configs.Hooks{}
+       if rspec.Hooks != nil {
+
+               for _, h := range rspec.Hooks.Prestart {
+                       cmd := createCommandHook(h)
+                       config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
+               }
+               for _, h := range rspec.Hooks.Poststart {
+                       cmd := createCommandHook(h)
+                       config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
+               }
+               for _, h := range rspec.Hooks.Poststop {
+                       cmd := createCommandHook(h)
+                       config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
+               }
+       }
+}
+
+func createCommandHook(h specs.Hook) configs.Command {
+       cmd := configs.Command{
+               Path: h.Path,
+               Args: h.Args,
+               Env:  h.Env,
+       }
+       if h.Timeout != nil {
+               d := time.Duration(*h.Timeout) * time.Second
+               cmd.Timeout = &d
+       }
+       return cmd
+}
diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go
new file mode 100644 (file)
index 0000000..da6a43a
--- /dev/null
@@ -0,0 +1,452 @@
+// +build linux
+
+package specconv
+
+import (
+       "os"
+       "strings"
+       "testing"
+
+       "golang.org/x/sys/unix"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/configs/validate"
+       "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestCreateCommandHookTimeout(t *testing.T) {
+       timeout := 3600
+       hook := specs.Hook{
+               Path:    "/some/hook/path",
+               Args:    []string{"--some", "thing"},
+               Env:     []string{"SOME=value"},
+               Timeout: &timeout,
+       }
+       command := createCommandHook(hook)
+       timeoutStr := command.Timeout.String()
+       if timeoutStr != "1h0m0s" {
+               t.Errorf("Expected the Timeout to be 1h0m0s, got: %s", timeoutStr)
+       }
+}
+
+func TestCreateHooks(t *testing.T) {
+       rspec := &specs.Spec{
+               Hooks: &specs.Hooks{
+                       Prestart: []specs.Hook{
+                               {
+                                       Path: "/some/hook/path",
+                               },
+                               {
+                                       Path: "/some/hook2/path",
+                                       Args: []string{"--some", "thing"},
+                               },
+                       },
+                       Poststart: []specs.Hook{
+                               {
+                                       Path: "/some/hook/path",
+                                       Args: []string{"--some", "thing"},
+                                       Env:  []string{"SOME=value"},
+                               },
+                               {
+                                       Path: "/some/hook2/path",
+                               },
+                               {
+                                       Path: "/some/hook3/path",
+                               },
+                       },
+                       Poststop: []specs.Hook{
+                               {
+                                       Path: "/some/hook/path",
+                                       Args: []string{"--some", "thing"},
+                                       Env:  []string{"SOME=value"},
+                               },
+                               {
+                                       Path: "/some/hook2/path",
+                               },
+                               {
+                                       Path: "/some/hook3/path",
+                               },
+                               {
+                                       Path: "/some/hook4/path",
+                                       Args: []string{"--some", "thing"},
+                               },
+                       },
+               },
+       }
+       conf := &configs.Config{}
+       createHooks(rspec, conf)
+
+       prestart := conf.Hooks.Prestart
+
+       if len(prestart) != 2 {
+               t.Error("Expected 2 Prestart hooks")
+       }
+
+       poststart := conf.Hooks.Poststart
+
+       if len(poststart) != 3 {
+               t.Error("Expected 3 Poststart hooks")
+       }
+
+       poststop := conf.Hooks.Poststop
+
+       if len(poststop) != 4 {
+               t.Error("Expected 4 Poststop hooks")
+       }
+
+}
// TestSetupSeccomp checks the conversion of an OCI seccomp spec into the
// libcontainer representation: default action, architecture names, the
// expansion of a multi-name syscall entry, and argument matching rules.
func TestSetupSeccomp(t *testing.T) {
	conf := &specs.LinuxSeccomp{
		DefaultAction: "SCMP_ACT_ERRNO",
		Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM},
		Syscalls: []specs.LinuxSyscall{
			{
				Names:  []string{"clone"},
				Action: "SCMP_ACT_ALLOW",
				Args: []specs.LinuxSeccompArg{
					{
						Index:    0,
						Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
						ValueTwo: 0,
						Op:       "SCMP_CMP_MASKED_EQ",
					},
				},
			},
			{
				Names: []string{
					"select",
					"semctl",
					"semget",
					"semop",
					"semtimedop",
					"send",
					"sendfile",
				},
				Action: "SCMP_ACT_ALLOW",
			},
		},
	}
	seccomp, err := SetupSeccomp(conf)

	if err != nil {
		t.Errorf("Couldn't create Seccomp config: %v", err)
	}

	if seccomp.DefaultAction != 2 { // SCMP_ACT_ERRNO
		t.Error("Wrong conversion for DefaultAction")
	}

	if len(seccomp.Architectures) != 2 {
		t.Error("Wrong number of architectures")
	}

	if seccomp.Architectures[0] != "amd64" || seccomp.Architectures[1] != "arm" {
		t.Error("Expected architectures are not found")
	}

	calls := seccomp.Syscalls

	// 1 rule for "clone" plus the 7 names of the second entry: the
	// conversion produces one syscall rule per name.
	callsLength := len(calls)
	if callsLength != 8 {
		t.Errorf("Expected 8 syscalls, got :%d", callsLength)
	}

	for i, call := range calls {
		if i == 0 {
			// Only the first rule ("clone") carries argument filters.
			expectedCloneSyscallArgs := configs.Arg{
				Index:    0,
				Op:       7, // SCMP_CMP_MASKED_EQ
				Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
				ValueTwo: 0,
			}
			if expectedCloneSyscallArgs != *call.Args[0] {
				t.Errorf("Wrong arguments conversion for the clone syscall under test")
			}
		}
		// Every rule in the spec uses SCMP_ACT_ALLOW (numeric value 4).
		if call.Action != 4 {
			t.Error("Wrong conversion for the clone syscall action")
		}

	}

}
+
+func TestLinuxCgroupWithMemoryResource(t *testing.T) {
+       cgroupsPath := "/user/cgroups/path/id"
+
+       spec := &specs.Spec{}
+       devices := []specs.LinuxDeviceCgroup{
+               {
+                       Allow:  false,
+                       Access: "rwm",
+               },
+       }
+
+       limit := int64(100)
+       reservation := int64(50)
+       swap := int64(20)
+       kernel := int64(40)
+       kernelTCP := int64(45)
+       swappiness := uint64(1)
+       swappinessPtr := &swappiness
+       disableOOMKiller := true
+       resources := &specs.LinuxResources{
+               Devices: devices,
+               Memory: &specs.LinuxMemory{
+                       Limit:            &limit,
+                       Reservation:      &reservation,
+                       Swap:             &swap,
+                       Kernel:           &kernel,
+                       KernelTCP:        &kernelTCP,
+                       Swappiness:       swappinessPtr,
+                       DisableOOMKiller: &disableOOMKiller,
+               },
+       }
+       spec.Linux = &specs.Linux{
+               CgroupsPath: cgroupsPath,
+               Resources:   resources,
+       }
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: false,
+               Spec:             spec,
+       }
+
+       cgroup, err := CreateCgroupConfig(opts)
+       if err != nil {
+               t.Errorf("Couldn't create Cgroup config: %v", err)
+       }
+
+       if cgroup.Path != cgroupsPath {
+               t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path)
+       }
+       if cgroup.Resources.Memory != limit {
+               t.Errorf("Expected to have %d as memory limit, got %d", limit, cgroup.Resources.Memory)
+       }
+       if cgroup.Resources.MemoryReservation != reservation {
+               t.Errorf("Expected to have %d as memory reservation, got %d", reservation, cgroup.Resources.MemoryReservation)
+       }
+       if cgroup.Resources.MemorySwap != swap {
+               t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap)
+       }
+       if cgroup.Resources.KernelMemory != kernel {
+               t.Errorf("Expected to have %d as Kernel Memory, got %d", kernel, cgroup.Resources.KernelMemory)
+       }
+       if cgroup.Resources.KernelMemoryTCP != kernelTCP {
+               t.Errorf("Expected to have %d as TCP Kernel Memory, got %d", kernelTCP, cgroup.Resources.KernelMemoryTCP)
+       }
+       if cgroup.Resources.MemorySwappiness != swappinessPtr {
+               t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness)
+       }
+       if cgroup.Resources.OomKillDisable != disableOOMKiller {
+               t.Errorf("The OOMKiller should be enabled")
+       }
+}
+
+func TestLinuxCgroupSystemd(t *testing.T) {
+       cgroupsPath := "parent:scopeprefix:name"
+
+       spec := &specs.Spec{}
+       spec.Linux = &specs.Linux{
+               CgroupsPath: cgroupsPath,
+       }
+
+       opts := &CreateOpts{
+               UseSystemdCgroup: true,
+               Spec:             spec,
+       }
+
+       cgroup, err := CreateCgroupConfig(opts)
+
+       if err != nil {
+               t.Errorf("Couldn't create Cgroup config: %v", err)
+       }
+
+       expectedParent := "parent"
+       if cgroup.Parent != expectedParent {
+               t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent)
+       }
+
+       expectedScopePrefix := "scopeprefix"
+       if cgroup.ScopePrefix != expectedScopePrefix {
+               t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix)
+       }
+
+       expectedName := "name"
+       if cgroup.Name != expectedName {
+               t.Errorf("Expected to have %s as Name instead of %s", expectedName, cgroup.Name)
+       }
+}
+
+func TestLinuxCgroupSystemdWithEmptyPath(t *testing.T) {
+       cgroupsPath := ""
+
+       spec := &specs.Spec{}
+       spec.Linux = &specs.Linux{
+               CgroupsPath: cgroupsPath,
+       }
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: true,
+               Spec:             spec,
+       }
+
+       cgroup, err := CreateCgroupConfig(opts)
+
+       if err != nil {
+               t.Errorf("Couldn't create Cgroup config: %v", err)
+       }
+
+       expectedParent := "system.slice"
+       if cgroup.Parent != expectedParent {
+               t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent)
+       }
+
+       expectedScopePrefix := "runc"
+       if cgroup.ScopePrefix != expectedScopePrefix {
+               t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix)
+       }
+
+       if cgroup.Name != opts.CgroupName {
+               t.Errorf("Expected to have %s as Name instead of %s", opts.CgroupName, cgroup.Name)
+       }
+}
+
+func TestLinuxCgroupSystemdWithInvalidPath(t *testing.T) {
+       cgroupsPath := "/user/cgroups/path/id"
+
+       spec := &specs.Spec{}
+       spec.Linux = &specs.Linux{
+               CgroupsPath: cgroupsPath,
+       }
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: true,
+               Spec:             spec,
+       }
+
+       _, err := CreateCgroupConfig(opts)
+       if err == nil {
+               t.Error("Expected to produce an error if not using the correct format for cgroup paths belonging to systemd")
+       }
+}
+func TestLinuxCgroupsPathSpecified(t *testing.T) {
+       cgroupsPath := "/user/cgroups/path/id"
+
+       spec := &specs.Spec{}
+       spec.Linux = &specs.Linux{
+               CgroupsPath: cgroupsPath,
+       }
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: false,
+               Spec:             spec,
+       }
+
+       cgroup, err := CreateCgroupConfig(opts)
+       if err != nil {
+               t.Errorf("Couldn't create Cgroup config: %v", err)
+       }
+
+       if cgroup.Path != cgroupsPath {
+               t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path)
+       }
+}
+
+func TestLinuxCgroupsPathNotSpecified(t *testing.T) {
+       spec := &specs.Spec{}
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: false,
+               Spec:             spec,
+       }
+
+       cgroup, err := CreateCgroupConfig(opts)
+       if err != nil {
+               t.Errorf("Couldn't create Cgroup config: %v", err)
+       }
+
+       if cgroup.Path != "" {
+               t.Errorf("Wrong cgroupsPath, expected it to be empty string, got '%s'", cgroup.Path)
+       }
+}
+
+func TestSpecconvExampleValidate(t *testing.T) {
+       spec := Example()
+       spec.Root.Path = "/"
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: false,
+               Spec:             spec,
+       }
+
+       config, err := CreateLibcontainerConfig(opts)
+       if err != nil {
+               t.Errorf("Couldn't create libcontainer config: %v", err)
+       }
+
+       validator := validate.New()
+       if err := validator.Validate(config); err != nil {
+               t.Errorf("Expected specconv to produce valid container config: %v", err)
+       }
+}
+
+func TestDupNamespaces(t *testing.T) {
+       spec := &specs.Spec{
+               Root: &specs.Root{
+                       Path: "rootfs",
+               },
+               Linux: &specs.Linux{
+                       Namespaces: []specs.LinuxNamespace{
+                               {
+                                       Type: "pid",
+                               },
+                               {
+                                       Type: "pid",
+                                       Path: "/proc/1/ns/pid",
+                               },
+                       },
+               },
+       }
+
+       _, err := CreateLibcontainerConfig(&CreateOpts{
+               Spec: spec,
+       })
+
+       if !strings.Contains(err.Error(), "malformed spec file: duplicated ns") {
+               t.Errorf("Duplicated namespaces should be forbidden")
+       }
+}
+
+func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
+       if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+               t.Skip("userns is unsupported")
+       }
+
+       spec := Example()
+       spec.Root.Path = "/"
+       ToRootless(spec)
+
+       opts := &CreateOpts{
+               CgroupName:       "ContainerID",
+               UseSystemdCgroup: false,
+               Spec:             spec,
+               RootlessEUID:     true,
+               RootlessCgroups:  true,
+       }
+
+       config, err := CreateLibcontainerConfig(opts)
+       if err != nil {
+               t.Errorf("Couldn't create libcontainer config: %v", err)
+       }
+
+       validator := validate.New()
+       if err := validator.Validate(config); err != nil {
+               t.Errorf("Expected specconv to produce valid rootless container config: %v", err)
+       }
+}
diff --git a/libcontainer/stacktrace/capture.go b/libcontainer/stacktrace/capture.go
new file mode 100644 (file)
index 0000000..0bbe149
--- /dev/null
@@ -0,0 +1,27 @@
+package stacktrace
+
+import "runtime"
+
+// Capture captures a stacktrace for the current calling go program
+//
+// skip is the number of frames to skip
+func Capture(userSkip int) Stacktrace {
+       var (
+               skip   = userSkip + 1 // add one for our own function
+               frames []Frame
+               prevPc uintptr
+       )
+       for i := skip; ; i++ {
+               pc, file, line, ok := runtime.Caller(i)
+               //detect if caller is repeated to avoid loop, gccgo
+               //currently runs  into a loop without this check
+               if !ok || pc == prevPc {
+                       break
+               }
+               frames = append(frames, NewFrame(pc, file, line))
+               prevPc = pc
+       }
+       return Stacktrace{
+               Frames: frames,
+       }
+}
diff --git a/libcontainer/stacktrace/capture_test.go b/libcontainer/stacktrace/capture_test.go
new file mode 100644 (file)
index 0000000..978f6c4
--- /dev/null
@@ -0,0 +1,31 @@
+package stacktrace
+
+import (
+       "strings"
+       "testing"
+)
+
// captureFunc is a fixture: Capture(0) skips only Capture itself, so this
// function's own frame should be the first one in the returned trace.
func captureFunc() Stacktrace {
	return Capture(0)
}
+
+func TestCaptureTestFunc(t *testing.T) {
+       stack := captureFunc()
+
+       if len(stack.Frames) == 0 {
+               t.Fatal("expected stack frames to be returned")
+       }
+
+       // the first frame is the caller
+       frame := stack.Frames[0]
+       if expected := "captureFunc"; frame.Function != expected {
+               t.Fatalf("expected function %q but received %q", expected, frame.Function)
+       }
+       expected := "/runc/libcontainer/stacktrace"
+       if !strings.HasSuffix(frame.Package, expected) {
+               t.Fatalf("expected package %q but received %q", expected, frame.Package)
+       }
+       if expected := "capture_test.go"; frame.File != expected {
+               t.Fatalf("expected file %q but received %q", expected, frame.File)
+       }
+}
diff --git a/libcontainer/stacktrace/frame.go b/libcontainer/stacktrace/frame.go
new file mode 100644 (file)
index 0000000..0d590d9
--- /dev/null
@@ -0,0 +1,38 @@
+package stacktrace
+
+import (
+       "path/filepath"
+       "runtime"
+       "strings"
+)
+
+// NewFrame returns a new stack frame for the provided information
+func NewFrame(pc uintptr, file string, line int) Frame {
+       fn := runtime.FuncForPC(pc)
+       if fn == nil {
+               return Frame{}
+       }
+       pack, name := parseFunctionName(fn.Name())
+       return Frame{
+               Line:     line,
+               File:     filepath.Base(file),
+               Package:  pack,
+               Function: name,
+       }
+}
+
// parseFunctionName splits a fully qualified name, as produced by
// runtime.Func.Name, at its last dot into (package path, function name).
// With no dot present the whole input is treated as the function name.
func parseFunctionName(name string) (string, string) {
	if i := strings.LastIndex(name, "."); i >= 0 {
		return name[:i], name[i+1:]
	}
	return "", name
}
+
// Frame contains all the information for a stack frame within a go program
type Frame struct {
	File     string // base name of the source file (directory stripped)
	Function string // bare function name, without the package qualifier
	Package  string // package path parsed from the qualified function name
	Line     int    // line number within File
}
diff --git a/libcontainer/stacktrace/frame_test.go b/libcontainer/stacktrace/frame_test.go
new file mode 100644 (file)
index 0000000..c6fc78e
--- /dev/null
@@ -0,0 +1,20 @@
+package stacktrace
+
+import "testing"
+
+func TestParsePackageName(t *testing.T) {
+       var (
+               name             = "github.com/opencontainers/runc/libcontainer/stacktrace.captureFunc"
+               expectedPackage  = "github.com/opencontainers/runc/libcontainer/stacktrace"
+               expectedFunction = "captureFunc"
+       )
+
+       pack, funcName := parseFunctionName(name)
+       if pack != expectedPackage {
+               t.Fatalf("expected package %q but received %q", expectedPackage, pack)
+       }
+
+       if funcName != expectedFunction {
+               t.Fatalf("expected function %q but received %q", expectedFunction, funcName)
+       }
+}
diff --git a/libcontainer/stacktrace/stacktrace.go b/libcontainer/stacktrace/stacktrace.go
new file mode 100644 (file)
index 0000000..5e8b58d
--- /dev/null
@@ -0,0 +1,5 @@
+package stacktrace
+
// Stacktrace is an ordered list of captured stack frames, with the frame
// closest to the capture point first.
type Stacktrace struct {
	Frames []Frame
}
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
new file mode 100644 (file)
index 0000000..4e03b8b
--- /dev/null
@@ -0,0 +1,214 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "os"
+       "os/exec"
+       "runtime"
+       "syscall" //only for Exec
+
+       "github.com/opencontainers/runc/libcontainer/apparmor"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/keys"
+       "github.com/opencontainers/runc/libcontainer/seccomp"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/selinux/go-selinux/label"
+       "github.com/pkg/errors"
+
+       "golang.org/x/sys/unix"
+)
+
// linuxStandardInit carries the state the standard init path needs to
// finish container setup inside the new namespaces and exec the user
// process (see Init).
type linuxStandardInit struct {
	pipe          *os.File // sync/config pipe shared with the parent runc process
	consoleSocket *os.File // socket over which the console is set up when CreateConsole is requested
	parentPid     int      // pid of our parent when we started; checked against Getppid before exec
	fifoFd        int      // O_PATH fd to the exec fifo, re-opened via /proc/self/fd before exec
	config        *initConfig
}
+
+func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
+       var newperms uint32
+
+       if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
+               // With user ns we need 'other' search permissions.
+               newperms = 0x8
+       } else {
+               // Without user ns we need 'UID' search permissions.
+               newperms = 0x80000
+       }
+
+       // Create a unique per session container name that we can join in setns;
+       // However, other containers can also join it.
+       return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
+}
+
// Init finalizes the container environment from inside the new
// namespaces -- session keyring, network, rootfs, console, hostname,
// sysctls, masked/read-only paths, labels and seccomp -- synchronizes
// with the parent and the exec fifo, and finally execs the user process.
// On success it never returns. The ordering of the steps below is
// significant; do not reorder them.
func (l *linuxStandardInit) Init() error {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if !l.config.Config.NoNewKeyring {
		if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer label.SetKeyLabel("")
		ringname, keepperms, newperms := l.getSessionRingParams()

		// Do not inherit the parent's session keyring.
		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
			// If keyrings aren't supported then it is likely we are on an
			// older kernel (or inside an LXC container). While we could bail,
			// the security feature we are using here is best-effort (it only
			// really provides marginal protection since VFS credentials are
			// the only significant protection of keyrings).
			//
			// TODO(cyphar): Log this so people know what's going on, once we
			//               have proper logging in 'runc init'.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
			}
		} else {
			// Make session keyring searcheable. If we've gotten this far we
			// bail on any error -- we don't want to have a keyring with bad
			// permissions.
			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
				return errors.Wrap(err, "mod keyring permissions")
			}
		}
	}

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	label.Init()
	if err := prepareRootfs(l.pipe, l.config); err != nil {
		return err
	}
	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return errors.Wrap(err, "setctty")
		}
	}

	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return errors.Wrap(err, "sethostname")
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return errors.Wrap(err, "apply apparmor profile")
	}

	// Apply sysctls and hide/neuter the configured paths.
	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return errors.Wrapf(err, "write sysctl key %s", key)
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := readonlyPath(path); err != nil {
			return errors.Wrapf(err, "readonly path %s", path)
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
			return errors.Wrapf(err, "mask path %s", path)
		}
	}
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return errors.Wrap(err, "get pdeath signal")
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return errors.Wrap(err, "set nonewprivileges")
		}
	}
	// Tell our parent that we're ready to Execv. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return errors.Wrap(err, "sync ready")
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return errors.Wrap(err, "set process label")
	}
	defer label.SetProcessLabel("")
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return errors.Wrap(err, "restore pdeath signal")
	}
	// Compare the parent from the initial start of the init process and make
	// sure that it did not change.  if the parent changes that means it died
	// and we were reparented to something else so we should just kill ourself
	// and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}
	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// Close the pipe to signal that we have completed our init.
	l.pipe.Close()
	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo")
	}
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return newSystemErrorWithCause(err, "write 0 exec fifo")
	}
	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	unix.Close(l.fifoFd)
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}
diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go
new file mode 100644 (file)
index 0000000..5c16a42
--- /dev/null
@@ -0,0 +1,251 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "fmt"
+       "os"
+       "path/filepath"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+
+       "github.com/sirupsen/logrus"
+       "golang.org/x/sys/unix"
+)
+
+// newStateTransitionError builds a stateTransitionError describing an attempt
+// to move between the two given states.
+func newStateTransitionError(from, to containerState) error {
+	return &stateTransitionError{
+		From: from.status().String(),
+		To:   to.status().String(),
+	}
+}
+
+// stateTransitionError is returned when an invalid state transition happens from one
+// state to another.
+type stateTransitionError struct {
+	From string
+	To   string
+}
+
+// Error implements the error interface.
+func (s *stateTransitionError) Error() string {
+	return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
+}
+
+// containerState is the internal state-machine interface: each concrete state
+// knows which transitions out of it are legal, how to destroy the container
+// from that state, and which public Status it maps to.
+type containerState interface {
+	transition(containerState) error
+	destroy() error
+	status() Status
+}
+
+// destroy tears down the container's resources (cgroups, Intel RDT, state
+// directory) and runs the poststop hooks. All cleanup steps run regardless of
+// earlier failures; the FIRST error encountered is the one returned.
+func destroy(c *linuxContainer) error {
+	// Without a private PID namespace, killing init does not take the other
+	// container processes with it, so kill everything in the cgroup
+	// explicitly. A failure here is only logged — cleanup continues.
+	if !c.config.Namespaces.Contains(configs.NEWPID) {
+		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
+			logrus.Warn(err)
+		}
+	}
+	err := c.cgroupManager.Destroy()
+	if c.intelRdtManager != nil {
+		if ierr := c.intelRdtManager.Destroy(); err == nil {
+			err = ierr
+		}
+	}
+	if rerr := os.RemoveAll(c.root); err == nil {
+		err = rerr
+	}
+	c.initProcess = nil
+	if herr := runPoststopHooks(c); err == nil {
+		err = herr
+	}
+	c.state = &stoppedState{c: c}
+	return err
+}
+
+// runPoststopHooks runs the configured OCI poststop hooks against the
+// container's current OCI state, stopping at the first hook failure.
+func runPoststopHooks(c *linuxContainer) error {
+	if c.config.Hooks != nil {
+		s, err := c.currentOCIState()
+		if err != nil {
+			return err
+		}
+		for _, hook := range c.config.Hooks.Poststop {
+			if err := hook.Run(s); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// stoppedState represents a container in a stopped/destroyed state.
+type stoppedState struct {
+	c *linuxContainer
+}
+
+// status reports Stopped.
+func (b *stoppedState) status() Status {
+	return Stopped
+}
+
+// transition allows stopped -> running/restored (a fresh start or a
+// checkpoint restore); stopped -> stopped is a no-op.
+func (b *stoppedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *restoredState:
+		b.c.state = s
+		return nil
+	case *stoppedState:
+		return nil
+	}
+	return newStateTransitionError(b, s)
+}
+
+// destroy releases all resources; the container is already stopped.
+func (b *stoppedState) destroy() error {
+	return destroy(b.c)
+}
+
+// runningState represents a container that is currently running.
+type runningState struct {
+	c *linuxContainer
+}
+
+// status reports Running.
+func (r *runningState) status() Status {
+	return Running
+}
+
+// transition allows running -> stopped (only once the init process has
+// actually exited), running -> paused, and treats running -> running as a
+// no-op.
+func (r *runningState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState:
+		// Re-check the live run type so we never mark a container stopped
+		// while its init process is still alive.
+		t, err := r.c.runType()
+		if err != nil {
+			return err
+		}
+		if t == Running {
+			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
+		}
+		r.c.state = s
+		return nil
+	case *pausedState:
+		r.c.state = s
+		return nil
+	case *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+// destroy refuses to tear down a container whose init process is still
+// running; otherwise it performs the shared destroy sequence.
+func (r *runningState) destroy() error {
+	t, err := r.c.runType()
+	if err != nil {
+		return err
+	}
+	if t == Running {
+		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
+	}
+	return destroy(r.c)
+}
+
+// createdState represents a container that has been created but whose user
+// process has not been started yet.
+type createdState struct {
+	c *linuxContainer
+}
+
+// status reports Created.
+func (i *createdState) status() Status {
+	return Created
+}
+
+// transition allows created -> running/paused/stopped; created -> created is
+// a no-op.
+func (i *createdState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *pausedState, *stoppedState:
+		i.c.state = s
+		return nil
+	case *createdState:
+		return nil
+	}
+	return newStateTransitionError(i, s)
+}
+
+// destroy kills the init process and then performs the shared destroy
+// sequence. The signal error is ignored here: this is a best-effort kill
+// and the process may already have exited.
+func (i *createdState) destroy() error {
+	i.c.initProcess.signal(unix.SIGKILL)
+	return destroy(i.c)
+}
+
+// pausedState represents a container that is currently paused. It cannot be
+// destroyed while its processes are still alive and must transition back to
+// running first.
+type pausedState struct {
+	c *linuxContainer
+}
+
+// status reports Paused.
+func (p *pausedState) status() Status {
+	return Paused
+}
+
+// transition allows paused -> running/stopped; paused -> paused is a no-op.
+func (p *pausedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *stoppedState:
+		p.c.state = s
+		return nil
+	case *pausedState:
+		return nil
+	}
+	return newStateTransitionError(p, s)
+}
+
+// destroy only proceeds when no live processes remain (run type is neither
+// Running nor Created); it then thaws the freezer cgroup before running the
+// shared destroy sequence. Otherwise a ContainerPaused error is returned.
+func (p *pausedState) destroy() error {
+	t, err := p.c.runType()
+	if err != nil {
+		return err
+	}
+	if t != Running && t != Created {
+		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
+			return err
+		}
+		return destroy(p.c)
+	}
+	return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
+}
+
+// restoredState is the same as the running state but also has associated
+// checkpoint information that may need to be destroyed when the container is
+// stopped and destroy is called.
+type restoredState struct {
+	imageDir string
+	c        *linuxContainer
+}
+
+// status reports Running: a restored container behaves like a running one.
+func (r *restoredState) status() Status {
+	return Running
+}
+
+// transition accepts stopped/running targets but, unlike the other states,
+// does not record the new state on the container.
+func (r *restoredState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState, *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+// destroy verifies the checkpoint directory is accessible (any stat error
+// other than "does not exist" aborts the destroy) before running the shared
+// destroy sequence.
+func (r *restoredState) destroy() error {
+	if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
+		if !os.IsNotExist(err) {
+			return err
+		}
+	}
+	return destroy(r.c)
+}
+
+// loadedState is used whenever a container is restored, loaded, or setting additional
+// processes inside and it should not be destroyed when it is exiting.
+type loadedState struct {
+	c *linuxContainer
+	s Status
+}
+
+// status reports whatever status the container was loaded with.
+func (n *loadedState) status() Status {
+	return n.s
+}
+
+// transition accepts any target state unconditionally.
+func (n *loadedState) transition(s containerState) error {
+	n.c.state = s
+	return nil
+}
+
+// destroy refreshes the container's real state from the system and then
+// delegates to that state's own destroy logic.
+func (n *loadedState) destroy() error {
+	if err := n.c.refreshState(); err != nil {
+		return err
+	}
+	return n.c.state.destroy()
+}
diff --git a/libcontainer/state_linux_test.go b/libcontainer/state_linux_test.go
new file mode 100644 (file)
index 0000000..6ef516b
--- /dev/null
@@ -0,0 +1,116 @@
+// +build linux
+
+package libcontainer
+
+import (
+       "reflect"
+       "testing"
+)
+
+// states maps one value of every concrete containerState implementation to
+// the Status it is expected to report.
+var states = map[containerState]Status{
+	&createdState{}:          Created,
+	&runningState{}:          Running,
+	&restoredState{}:         Running,
+	&pausedState{}:           Paused,
+	&stoppedState{}:          Stopped,
+	&loadedState{s: Running}: Running,
+}
+
+// TestStateStatus checks that every state reports its expected Status.
+func TestStateStatus(t *testing.T) {
+	for s, status := range states {
+		if s.status() != status {
+			t.Fatalf("state returned %s but expected %s", s.status(), status)
+		}
+	}
+}
+
+// isStateTransitionError reports whether err is a *stateTransitionError.
+func isStateTransitionError(err error) bool {
+	_, ok := err.(*stateTransitionError)
+	return ok
+}
+
+// testTransitions asserts that initialState accepts exactly the transitions
+// in valid: every listed target must succeed, and every other known state
+// must fail with a stateTransitionError.
+func testTransitions(t *testing.T, initialState containerState, valid []containerState) {
+	validMap := map[reflect.Type]interface{}{}
+	for _, validState := range valid {
+		validMap[reflect.TypeOf(validState)] = nil
+		t.Run(validState.status().String(), func(t *testing.T) {
+			if err := initialState.transition(validState); err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+	for state := range states {
+		if _, ok := validMap[reflect.TypeOf(state)]; ok {
+			continue
+		}
+		t.Run(state.status().String(), func(t *testing.T) {
+			err := initialState.transition(state)
+			if err == nil {
+				t.Fatal("transition should fail")
+			}
+			if !isStateTransitionError(err) {
+				t.Fatal("expected stateTransitionError")
+			}
+		})
+	}
+}
+
+// TestStoppedStateTransition checks the legal transitions out of stopped.
+func TestStoppedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&stoppedState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&runningState{},
+			&restoredState{},
+		},
+	)
+}
+
+// TestPausedStateTransition checks the legal transitions out of paused.
+func TestPausedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&pausedState{c: &linuxContainer{}},
+		[]containerState{
+			&pausedState{},
+			&runningState{},
+			&stoppedState{},
+		},
+	)
+}
+
+// TestRestoredStateTransition checks the legal transitions out of restored.
+func TestRestoredStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&restoredState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&runningState{},
+		},
+	)
+}
+
+// TestRunningStateTransition checks the legal transitions out of running.
+func TestRunningStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&runningState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&pausedState{},
+			&runningState{},
+		},
+	)
+}
+
+// TestCreatedStateTransition checks the legal transitions out of created.
+func TestCreatedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&createdState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&pausedState{},
+			&runningState{},
+			&createdState{},
+		},
+	)
+}
diff --git a/libcontainer/stats_linux.go b/libcontainer/stats_linux.go
new file mode 100644 (file)
index 0000000..fff9dd3
--- /dev/null
@@ -0,0 +1,13 @@
+package libcontainer
+
+import (
+       "github.com/opencontainers/runc/libcontainer/cgroups"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/types"
+)
+
+// Stats aggregates the runtime statistics collected for a container:
+// per-interface network counters, cgroup resource usage, and Intel RDT
+// statistics.
+type Stats struct {
+	Interfaces    []*types.NetworkInterface
+	CgroupStats   *cgroups.Stats
+	IntelRdtStats *intelrdt.Stats
+}
diff --git a/libcontainer/sync.go b/libcontainer/sync.go
new file mode 100644 (file)
index 0000000..a8704a2
--- /dev/null
@@ -0,0 +1,104 @@
+package libcontainer
+
+import (
+       "encoding/json"
+       "fmt"
+       "io"
+
+       "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// syncType names a synchronisation flag exchanged between parent and child.
+type syncType string
+
+// Constants that are used for synchronisation between the parent and child
+// during container setup. They come in pairs (with procError being a generic
+// response which is followed by a &genericError).
+//
+// [  child  ] <-> [   parent   ]
+//
+// procHooks   --> [run hooks]
+//             <-- procResume
+//
+// procConsole -->
+//             <-- procConsoleReq
+//  [send(fd)] --> [recv(fd)]
+//             <-- procConsoleAck
+//
+// procReady   --> [final setup]
+//             <-- procRun
+const (
+	procError  syncType = "procError"
+	procReady  syncType = "procReady"
+	procRun    syncType = "procRun"
+	procHooks  syncType = "procHooks"
+	procResume syncType = "procResume"
+)
+
+// syncT is the JSON message exchanged over the synchronisation pipe; only
+// the flag type is carried.
+type syncT struct {
+	Type syncType `json:"type"`
+}
+
+// writeSync is used to write to a synchronisation pipe. An error is returned
+// if there was a problem writing the payload.
+func writeSync(pipe io.Writer, sync syncType) error {
+	return utils.WriteJSON(pipe, syncT{sync})
+}
+
+// readSync is used to read from a synchronisation pipe. An error is returned
+// if we got a genericError, the pipe was closed, or we got an unexpected flag.
+//
+// BUG FIX: previously the procError and expected-flag checks were nested
+// inside the decode-error branch, so a successfully decoded flag — including
+// procError or a wrong flag — was silently treated as success. The checks
+// must run on the successful-decode path.
+func readSync(pipe io.Reader, expected syncType) error {
+	var procSync syncT
+	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+		// A closed pipe means the parent went away before sending the flag.
+		if err == io.EOF {
+			return fmt.Errorf("parent closed synchronisation channel")
+		}
+		return fmt.Errorf("failed reading error from parent: %v", err)
+	}
+
+	// procError is always followed by a genericError payload describing
+	// what went wrong on the other side.
+	if procSync.Type == procError {
+		var ierr genericError
+
+		if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
+			return fmt.Errorf("failed reading error from parent: %v", err)
+		}
+
+		return &ierr
+	}
+
+	if procSync.Type != expected {
+		return fmt.Errorf("invalid synchronisation flag from parent")
+	}
+	return nil
+}
+
+// parseSync runs the given callback function on each syncT received from the
+// child. It will return once io.EOF is returned from the given pipe.
+func parseSync(pipe io.Reader, fn func(*syncT) error) error {
+	dec := json.NewDecoder(pipe)
+	for {
+		var sync syncT
+		if err := dec.Decode(&sync); err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+
+		// We handle this case outside fn for cleanliness reasons.
+		var ierr *genericError
+		if sync.Type == procError {
+			// The error payload must immediately follow the procError flag.
+			if err := dec.Decode(&ierr); err != nil && err != io.EOF {
+				return newSystemErrorWithCause(err, "decoding proc error from init")
+			}
+			if ierr != nil {
+				return ierr
+			}
+			// Programmer error.
+			panic("No error following JSON procError payload.")
+		}
+
+		if err := fn(&sync); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
new file mode 100644 (file)
index 0000000..a4ae890
--- /dev/null
@@ -0,0 +1,155 @@
+// +build linux
+
+package system
+
+import (
+       "os"
+       "os/exec"
+       "syscall" // only for exec
+       "unsafe"
+
+       "github.com/opencontainers/runc/libcontainer/user"
+       "golang.org/x/sys/unix"
+)
+
+// If arg2 is nonzero, set the "child subreaper" attribute of the
+// calling process; if arg2 is zero, unset the attribute.  When a
+// process is marked as a child subreaper, all of the children
+// that it creates, and their descendants, will be marked as
+// having a subreaper.  In effect, a subreaper fulfills the role
+// of init(1) for its descendant processes.  Upon termination of
+// a process that is orphaned (i.e., its immediate parent has
+// already terminated) and marked as having a subreaper, the
+// nearest still living ancestor subreaper will receive a SIGCHLD
+// signal and be able to wait(2) on the process to discover its
+// termination status.
+const PR_SET_CHILD_SUBREAPER = 36
+
+// ParentDeathSignal holds a signal number configured via PR_SET_PDEATHSIG:
+// the signal this process receives when its parent dies.
+type ParentDeathSignal int
+
+// Restore re-applies the saved parent-death signal. A zero value means
+// nothing was saved; if the current setting already matches, no prctl
+// call is made.
+func (p ParentDeathSignal) Restore() error {
+	if p == 0 {
+		return nil
+	}
+	current, err := GetParentDeathSignal()
+	if err != nil {
+		return err
+	}
+	if p == current {
+		return nil
+	}
+	return p.Set()
+}
+
+// Set installs p as the calling process's parent-death signal.
+func (p ParentDeathSignal) Set() error {
+	return SetParentDeathSignal(uintptr(p))
+}
+
+// Execv resolves cmd against PATH and replaces the current process image
+// with it (execve semantics); it only returns on error.
+func Execv(cmd string, args []string, env []string) error {
+	name, err := exec.LookPath(cmd)
+	if err != nil {
+		return err
+	}
+
+	return syscall.Exec(name, args, env)
+}
+
+// Prlimit applies limit for the given resource to process pid via
+// prlimit64. Note the same struct is passed as both the new- and old-limit
+// arguments; callers here only use this to set limits.
+func Prlimit(pid, resource int, limit unix.Rlimit) error {
+	_, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
+	if err != 0 {
+		return err
+	}
+	return nil
+}
+
+// SetParentDeathSignal sets the signal delivered to this process when its
+// parent terminates (PR_SET_PDEATHSIG).
+func SetParentDeathSignal(sig uintptr) error {
+	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// GetParentDeathSignal reads the currently configured parent-death signal
+// (PR_GET_PDEATHSIG).
+func GetParentDeathSignal() (ParentDeathSignal, error) {
+	var sig int
+	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+	return ParentDeathSignal(sig), nil
+}
+
+// SetKeepCaps tells the kernel to retain permitted capabilities across a
+// change of UID (PR_SET_KEEPCAPS = 1).
+func SetKeepCaps() error {
+	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// ClearKeepCaps restores the default behaviour of dropping capabilities on
+// a UID change (PR_SET_KEEPCAPS = 0).
+func ClearKeepCaps() error {
+	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Setctty makes the terminal on stdin (fd 0) the controlling terminal of
+// the calling process (TIOCSCTTY).
+func Setctty() error {
+	if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func RunningInUserNS() bool {
+	uidmap, err := user.CurrentProcessUIDMap()
+	if err != nil {
+		// This kernel-provided file only exists if user namespaces are supported
+		return false
+	}
+	return UIDMapInUserNS(uidmap)
+}
+
+// UIDMapInUserNS reports whether the given uid_map describes a user
+// namespace other than the initial one. An empty map (freshly created
+// userns) therefore also reports true.
+func UIDMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
+
+// GetParentNSeuid returns the euid within the parent user namespace
+func GetParentNSeuid() int64 {
+	euid := int64(os.Geteuid())
+	uidmap, err := user.CurrentProcessUIDMap()
+	if err != nil {
+		// This kernel-provided file only exists if user namespaces are supported
+		return euid
+	}
+	// Translate our euid through the first uid_map entry whose range
+	// contains it; fall back to the untranslated euid if none matches.
+	for _, um := range uidmap {
+		if um.ID <= euid && euid <= um.ID+um.Count-1 {
+			return um.ParentID + euid - um.ID
+		}
+	}
+	return euid
+}
+
+// SetSubreaper sets the value i as the subreaper setting for the calling process
+func SetSubreaper(i int) error {
+	return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
+}
+
+// GetSubreaper returns the subreaper setting for the calling process
+func GetSubreaper() (int, error) {
+	var i uintptr
+
+	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+
+	return int(i), nil
+}
diff --git a/libcontainer/system/linux_test.go b/libcontainer/system/linux_test.go
new file mode 100644 (file)
index 0000000..4d613d8
--- /dev/null
@@ -0,0 +1,45 @@
+// +build linux
+
+package system
+
+import (
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/user"
+)
+
+// TestUIDMapInUserNS exercises UIDMapInUserNS against representative
+// /proc/self/uid_map contents, including the empty file seen in a freshly
+// created user namespace.
+func TestUIDMapInUserNS(t *testing.T) {
+	cases := []struct {
+		s        string
+		expected bool
+	}{
+		{
+			s:        "         0          0 4294967295\n",
+			expected: false,
+		},
+		{
+			s:        "         0          0          1\n",
+			expected: true,
+		},
+		{
+			s:        "         0       1001          1\n         1     231072      65536\n",
+			expected: true,
+		},
+		{
+			// file exist but empty (the initial state when userns is created. see man 7 user_namespaces)
+			s:        "",
+			expected: true,
+		},
+	}
+	for _, c := range cases {
+		uidmap, err := user.ParseIDMap(strings.NewReader(c.s))
+		if err != nil {
+			t.Fatal(err)
+		}
+		actual := UIDMapInUserNS(uidmap)
+		if c.expected != actual {
+			t.Fatalf("expected %v, got %v for %q", c.expected, actual, c.s)
+		}
+	}
+}
diff --git a/libcontainer/system/proc.go b/libcontainer/system/proc.go
new file mode 100644 (file)
index 0000000..79232a4
--- /dev/null
@@ -0,0 +1,113 @@
+package system
+
+import (
+       "fmt"
+       "io/ioutil"
+       "path/filepath"
+       "strconv"
+       "strings"
+)
+
+// State is the status of a process. Each value is the single-character
+// state code reported in /proc/[pid]/stat.
+type State rune
+
+const ( // Only values for Linux 3.14 and later are listed here
+	Dead        State = 'X'
+	DiskSleep   State = 'D'
+	Running     State = 'R'
+	Sleeping    State = 'S'
+	Stopped     State = 'T'
+	TracingStop State = 't'
+	Zombie      State = 'Z'
+)
+
+// String forms of the state from proc(5)'s documentation for
+// /proc/[pid]/status' "State" field.
+func (s State) String() string {
+	switch s {
+	case Dead:
+		return "dead"
+	case DiskSleep:
+		return "disk sleep"
+	case Running:
+		return "running"
+	case Sleeping:
+		return "sleeping"
+	case Stopped:
+		return "stopped"
+	case TracingStop:
+		return "tracing stop"
+	case Zombie:
+		return "zombie"
+	default:
+		return fmt.Sprintf("unknown (%c)", s)
+	}
+}
+
+// Stat_t represents the information from /proc/[pid]/stat, as
+// described in proc(5) with names based on the /proc/[pid]/status
+// fields.
+type Stat_t struct {
+	// PID is the process ID.
+	PID uint
+
+	// Name is the command run by the process.
+	Name string
+
+	// State is the state of the process.
+	State State
+
+	// StartTime is the number of clock ticks after system boot (since
+	// Linux 2.6).
+	StartTime uint64
+}
+
+// Stat returns a Stat_t instance for the specified process, read from
+// /proc/[pid]/stat.
+func Stat(pid int) (stat Stat_t, err error) {
+	bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
+	if err != nil {
+		return stat, err
+	}
+	return parseStat(string(bytes))
+}
+
+// GetProcessStartTime is deprecated.  Use Stat(pid) and
+// Stat_t.StartTime instead.
+func GetProcessStartTime(pid int) (string, error) {
+	stat, err := Stat(pid)
+	if err != nil {
+		return "", err
+	}
+	return fmt.Sprintf("%d", stat.StartTime), nil
+}
+
+// parseStat parses the contents of /proc/[pid]/stat into a Stat_t. The
+// command name (field 2) may itself contain spaces, so it is delimited by
+// the first "(" and the LAST ")" in the data.
+func parseStat(data string) (stat Stat_t, err error) {
+	// From proc(5), field 2 could contain space and is inside `(` and `)`.
+	// The following is an example:
+	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+	i := strings.LastIndex(data, ")")
+	if i <= 2 || i >= len(data)-1 {
+		return stat, fmt.Errorf("invalid stat data: %q", data)
+	}
+
+	parts := strings.SplitN(data[:i], "(", 2)
+	if len(parts) != 2 {
+		return stat, fmt.Errorf("invalid stat data: %q", data)
+	}
+
+	stat.Name = parts[1]
+	_, err = fmt.Sscanf(parts[0], "%d", &stat.PID)
+	if err != nil {
+		return stat, err
+	}
+
+	// parts indexes should be offset by 3 from the field number given
+	// proc(5), because parts is zero-indexed and we've removed fields
+	// one (PID) and two (Name) in the paren-split.
+	parts = strings.Split(data[i+2:], " ")
+	var state int
+	// NOTE(review): the Sscanf errors below are ignored, so malformed state
+	// or start-time fields silently leave zero values in the result.
+	fmt.Sscanf(parts[3-3], "%c", &state)
+	stat.State = State(state)
+	fmt.Sscanf(parts[22-3], "%d", &stat.StartTime)
+	return stat, nil
+}
diff --git a/libcontainer/system/proc_test.go b/libcontainer/system/proc_test.go
new file mode 100644 (file)
index 0000000..7e1acc5
--- /dev/null
@@ -0,0 +1,45 @@
+package system
+
+import "testing"
+
+// TestParseStartTime feeds parseStat real-world /proc/[pid]/stat lines
+// (including a command name containing a space and a kernel-thread entry)
+// and checks the parsed PID, name, state, and start-time fields.
+func TestParseStartTime(t *testing.T) {
+	data := map[string]Stat_t{
+		"4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": {
+			PID:       4902,
+			Name:      "gunicorn: maste",
+			State:     'S',
+			StartTime: 9126532,
+		},
+		"9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": {
+			PID:       9534,
+			Name:      "cat",
+			State:     'R',
+			StartTime: 9214966,
+		},
+
+		"24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": {
+			PID:       24767,
+			Name:      "irq/44-mei_me",
+			State:     'S',
+			StartTime: 8722075,
+		},
+	}
+	for line, expected := range data {
+		st, err := parseStat(line)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if st.PID != expected.PID {
+			t.Fatalf("expected PID %q but received %q", expected.PID, st.PID)
+		}
+		if st.State != expected.State {
+			t.Fatalf("expected state %q but received %q", expected.State, st.State)
+		}
+		if st.Name != expected.Name {
+			t.Fatalf("expected name %q but received %q", expected.Name, st.Name)
+		}
+		if st.StartTime != expected.StartTime {
+			t.Fatalf("expected start time %q but received %q", expected.StartTime, st.StartTime)
+		}
+	}
+}
diff --git a/libcontainer/system/syscall_linux_32.go b/libcontainer/system/syscall_linux_32.go
new file mode 100644 (file)
index 0000000..c5ca5d8
--- /dev/null
@@ -0,0 +1,26 @@
+// +build linux
+// +build 386 arm
+
+package system
+
+import (
+       "golang.org/x/sys/unix"
+)
+
+// Setuid sets the uid of the calling thread to the specified uid.
+// It issues the raw SYS_SETUID32 syscall (the variant used on 386/arm)
+// directly, without going through the Go runtime wrappers.
+func Setuid(uid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// Setgid sets the gid of the calling thread to the specified gid.
+// It issues the raw SYS_SETGID32 syscall (the variant used on 386/arm).
+func Setgid(gid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
diff --git a/libcontainer/system/syscall_linux_64.go b/libcontainer/system/syscall_linux_64.go
new file mode 100644 (file)
index 0000000..e05e30a
--- /dev/null
@@ -0,0 +1,26 @@
+// +build linux
+// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x
+
+package system
+
+import (
+       "golang.org/x/sys/unix"
+)
+
+// Setuid sets the uid of the calling thread to the specified uid.
+// It issues the raw SYS_SETUID syscall directly so only the calling thread
+// is affected, bypassing the Go runtime wrappers.
+func Setuid(uid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// Setgid sets the gid of the calling thread to the specified gid.
+// It issues the raw SYS_SETGID syscall directly.
+func Setgid(gid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
diff --git a/libcontainer/system/sysconfig.go b/libcontainer/system/sysconfig.go
new file mode 100644 (file)
index 0000000..b8434f1
--- /dev/null
@@ -0,0 +1,12 @@
+// +build cgo,linux
+
+package system
+
+/*
+#include <unistd.h>
+*/
+import "C"
+
+// GetClockTicks returns the number of clock ticks per second, as reported
+// by sysconf(_SC_CLK_TCK).
+func GetClockTicks() int {
+	return int(C.sysconf(C._SC_CLK_TCK))
+}
diff --git a/libcontainer/system/sysconfig_notcgo.go b/libcontainer/system/sysconfig_notcgo.go
new file mode 100644 (file)
index 0000000..d93b5d5
--- /dev/null
@@ -0,0 +1,15 @@
+// +build !cgo windows
+
+package system
+
+// GetClockTicks returns a hard-coded clock-tick rate of 100 because
+// sysconf(_SC_CLK_TCK) is unavailable without cgo.
+func GetClockTicks() int {
+	// TODO figure out a better alternative for platforms where we're missing cgo
+	//
+	// TODO Windows. This could be implemented using Win32 QueryPerformanceFrequency().
+	// https://msdn.microsoft.com/en-us/library/windows/desktop/ms644905(v=vs.85).aspx
+	//
+	// An example of its usage can be found here.
+	// https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx
+
+	return 100
+}
diff --git a/libcontainer/system/unsupported.go b/libcontainer/system/unsupported.go
new file mode 100644 (file)
index 0000000..b94be74
--- /dev/null
@@ -0,0 +1,27 @@
+// +build !linux
+
+package system
+
+import (
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/user"
+)
+
+// RunningInUserNS is a stub for non-Linux systems
+// Always returns false
+func RunningInUserNS() bool {
+	return false
+}
+
+// UIDMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func UIDMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}
+
+// GetParentNSeuid returns the euid within the parent user namespace
+// Always returns os.Geteuid on non-linux
+// NOTE(review): the linux build returns int64 while this stub returns int;
+// the cross-platform signatures disagree — confirm no caller builds both.
+func GetParentNSeuid() int {
+	return os.Geteuid()
+}
diff --git a/libcontainer/system/xattrs_linux.go b/libcontainer/system/xattrs_linux.go
new file mode 100644 (file)
index 0000000..a6823fc
--- /dev/null
@@ -0,0 +1,35 @@
+package system
+
+import "golang.org/x/sys/unix"
+
+// Lgetxattr retrieves the value of the extended attribute named attr on
+// path, without following a trailing symlink.
+// Returns a []byte slice if the xattr is set and nil otherwise
+func Lgetxattr(path string, attr string) ([]byte, error) {
+	var sz int
+	// Start with a 128 length byte array
+	dest := make([]byte, 128)
+	sz, errno := unix.Lgetxattr(path, attr, dest)
+
+	switch {
+	case errno == unix.ENODATA:
+		return nil, errno
+	case errno == unix.ENOTSUP:
+		return nil, errno
+	case errno == unix.ERANGE:
+		// 128 byte array might just not be good enough,
+		// A dummy buffer is used to get the real size
+		// of the xattrs on disk
+		sz, errno = unix.Lgetxattr(path, attr, []byte{})
+		if errno != nil {
+			return nil, errno
+		}
+		// NOTE(review): the value can change between the size probe above
+		// and this read; a concurrent grow would surface as a new ERANGE
+		// error rather than being retried.
+		dest = make([]byte, sz)
+		sz, errno = unix.Lgetxattr(path, attr, dest)
+		if errno != nil {
+			return nil, errno
+		}
+	case errno != nil:
+		return nil, errno
+	}
+	return dest[:sz], nil
+}
diff --git a/libcontainer/user/MAINTAINERS b/libcontainer/user/MAINTAINERS
new file mode 100644 (file)
index 0000000..edbe200
--- /dev/null
@@ -0,0 +1,2 @@
+Tianon Gravi <admwiggin@gmail.com> (@tianon)
+Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
diff --git a/libcontainer/user/lookup.go b/libcontainer/user/lookup.go
new file mode 100644 (file)
index 0000000..6fd8dd0
--- /dev/null
@@ -0,0 +1,41 @@
+package user
+
+import (
+       "errors"
+)
+
+var (
+       // The current operating system does not provide the required data for user lookups.
+       ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+       // No matching entries found in file.
+       ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+       ErrNoGroupEntries  = errors.New("no matching entries in group file")
+)
+
+// LookupUser looks up a user by their username in /etc/passwd. If the user
+// cannot be found (or there is no /etc/passwd file on the filesystem), then
+// LookupUser returns an error.
+func LookupUser(username string) (User, error) {
+       return lookupUser(username)
+}
+
+// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
+// be found (or there is no /etc/passwd file on the filesystem), then LookupId
+// returns an error.
+func LookupUid(uid int) (User, error) {
+       return lookupUid(uid)
+}
+
+// LookupGroup looks up a group by its name in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGroup
+// returns an error.
+func LookupGroup(groupname string) (Group, error) {
+       return lookupGroup(groupname)
+}
+
+// LookupGid looks up a group by its group id in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGid
+// returns an error.
+func LookupGid(gid int) (Group, error) {
+       return lookupGid(gid)
+}
diff --git a/libcontainer/user/lookup_unix.go b/libcontainer/user/lookup_unix.go
new file mode 100644 (file)
index 0000000..92b5ae8
--- /dev/null
@@ -0,0 +1,144 @@
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package user
+
+import (
+       "io"
+       "os"
+       "strconv"
+
+       "golang.org/x/sys/unix"
+)
+
+// Unix-specific path to the passwd and group formatted files.
+const (
+       unixPasswdPath = "/etc/passwd"
+       unixGroupPath  = "/etc/group"
+)
+
+func lookupUser(username string) (User, error) {
+       return lookupUserFunc(func(u User) bool {
+               return u.Name == username
+       })
+}
+
+func lookupUid(uid int) (User, error) {
+       return lookupUserFunc(func(u User) bool {
+               return u.Uid == uid
+       })
+}
+
+func lookupUserFunc(filter func(u User) bool) (User, error) {
+       // Get operating system-specific passwd reader-closer.
+       passwd, err := GetPasswd()
+       if err != nil {
+               return User{}, err
+       }
+       defer passwd.Close()
+
+       // Get the users.
+       users, err := ParsePasswdFilter(passwd, filter)
+       if err != nil {
+               return User{}, err
+       }
+
+       // No user entries found.
+       if len(users) == 0 {
+               return User{}, ErrNoPasswdEntries
+       }
+
+       // Assume the first entry is the "correct" one.
+       return users[0], nil
+}
+
+func lookupGroup(groupname string) (Group, error) {
+       return lookupGroupFunc(func(g Group) bool {
+               return g.Name == groupname
+       })
+}
+
+func lookupGid(gid int) (Group, error) {
+       return lookupGroupFunc(func(g Group) bool {
+               return g.Gid == gid
+       })
+}
+
+func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
+       // Get operating system-specific group reader-closer.
+       group, err := GetGroup()
+       if err != nil {
+               return Group{}, err
+       }
+       defer group.Close()
+
+       // Get the groups.
+       groups, err := ParseGroupFilter(group, filter)
+       if err != nil {
+               return Group{}, err
+       }
+
+       // No group entries found.
+       if len(groups) == 0 {
+               return Group{}, ErrNoGroupEntries
+       }
+
+       // Assume the first entry is the "correct" one.
+       return groups[0], nil
+}
+
+func GetPasswdPath() (string, error) {
+       return unixPasswdPath, nil
+}
+
+func GetPasswd() (io.ReadCloser, error) {
+       return os.Open(unixPasswdPath)
+}
+
+func GetGroupPath() (string, error) {
+       return unixGroupPath, nil
+}
+
+func GetGroup() (io.ReadCloser, error) {
+       return os.Open(unixGroupPath)
+}
+
+// CurrentUser looks up the current user by their user id in /etc/passwd. If the
+// user cannot be found (or there is no /etc/passwd file on the filesystem),
+// then CurrentUser returns an error.
+func CurrentUser() (User, error) {
+       return LookupUid(unix.Getuid())
+}
+
+// CurrentGroup looks up the current user's group by their primary group id's
+// entry in /etc/group. If the group cannot be found (or there is no
+// /etc/group file on the filesystem), then CurrentGroup returns an error.
+func CurrentGroup() (Group, error) {
+       return LookupGid(unix.Getgid())
+}
+
+func currentUserSubIDs(fileName string) ([]SubID, error) {
+       u, err := CurrentUser()
+       if err != nil {
+               return nil, err
+       }
+       filter := func(entry SubID) bool {
+               return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
+       }
+       return ParseSubIDFileFilter(fileName, filter)
+}
+
+func CurrentUserSubUIDs() ([]SubID, error) {
+       return currentUserSubIDs("/etc/subuid")
+}
+
+func CurrentUserSubGIDs() ([]SubID, error) {
+       return currentUserSubIDs("/etc/subgid")
+}
+
+func CurrentProcessUIDMap() ([]IDMap, error) {
+       return ParseIDMapFile("/proc/self/uid_map")
+}
+
+func CurrentProcessGIDMap() ([]IDMap, error) {
+       return ParseIDMapFile("/proc/self/gid_map")
+}
diff --git a/libcontainer/user/lookup_windows.go b/libcontainer/user/lookup_windows.go
new file mode 100644 (file)
index 0000000..65cd40e
--- /dev/null
@@ -0,0 +1,40 @@
+// +build windows
+
+package user
+
+import (
+       "fmt"
+       "os/user"
+)
+
+func lookupUser(username string) (User, error) {
+       u, err := user.Lookup(username)
+       if err != nil {
+               return User{}, err
+       }
+       return userFromOS(u)
+}
+
+func lookupUid(uid int) (User, error) {
+       u, err := user.LookupId(fmt.Sprintf("%d", uid))
+       if err != nil {
+               return User{}, err
+       }
+       return userFromOS(u)
+}
+
+func lookupGroup(groupname string) (Group, error) {
+       g, err := user.LookupGroup(groupname)
+       if err != nil {
+               return Group{}, err
+       }
+       return groupFromOS(g)
+}
+
+func lookupGid(gid int) (Group, error) {
+       g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
+       if err != nil {
+               return Group{}, err
+       }
+       return groupFromOS(g)
+}
diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
new file mode 100644 (file)
index 0000000..7b912bb
--- /dev/null
@@ -0,0 +1,608 @@
+package user
+
+import (
+       "bufio"
+       "fmt"
+       "io"
+       "os"
+       "os/user"
+       "strconv"
+       "strings"
+)
+
+const (
+       minId = 0
+       maxId = 1<<31 - 1 //for 32-bit systems compatibility
+)
+
+var (
+       ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
+)
+
+type User struct {
+       Name  string
+       Pass  string
+       Uid   int
+       Gid   int
+       Gecos string
+       Home  string
+       Shell string
+}
+
+// userFromOS converts an os/user.(*User) to local User
+//
+// (This does not include Pass, Shell or Gecos)
+func userFromOS(u *user.User) (User, error) {
+       newUser := User{
+               Name: u.Username,
+               Home: u.HomeDir,
+       }
+       id, err := strconv.Atoi(u.Uid)
+       if err != nil {
+               return newUser, err
+       }
+       newUser.Uid = id
+
+       id, err = strconv.Atoi(u.Gid)
+       if err != nil {
+               return newUser, err
+       }
+       newUser.Gid = id
+       return newUser, nil
+}
+
+type Group struct {
+       Name string
+       Pass string
+       Gid  int
+       List []string
+}
+
+// groupFromOS converts an os/user.(*Group) to local Group
+//
+// (This does not include Pass or List)
+func groupFromOS(g *user.Group) (Group, error) {
+       newGroup := Group{
+               Name: g.Name,
+       }
+
+       id, err := strconv.Atoi(g.Gid)
+       if err != nil {
+               return newGroup, err
+       }
+       newGroup.Gid = id
+
+       return newGroup, nil
+}
+
+// SubID represents an entry in /etc/sub{u,g}id
+type SubID struct {
+       Name  string
+       SubID int64
+       Count int64
+}
+
+// IDMap represents an entry in /proc/PID/{u,g}id_map
+type IDMap struct {
+       ID       int64
+       ParentID int64
+       Count    int64
+}
+
+func parseLine(line string, v ...interface{}) {
+       parseParts(strings.Split(line, ":"), v...)
+}
+
+func parseParts(parts []string, v ...interface{}) {
+       if len(parts) == 0 {
+               return
+       }
+
+       for i, p := range parts {
+               // Ignore cases where we don't have enough fields to populate the arguments.
+               // Some configuration files like to misbehave.
+               if len(v) <= i {
+                       break
+               }
+
+               // Use the type of the argument to figure out how to parse it, scanf() style.
+               // This is legit.
+               switch e := v[i].(type) {
+               case *string:
+                       *e = p
+               case *int:
+                       // "numbers", with conversion errors ignored because of some misbehaving configuration files.
+                       *e, _ = strconv.Atoi(p)
+               case *int64:
+                       *e, _ = strconv.ParseInt(p, 10, 64)
+               case *[]string:
+                       // Comma-separated lists.
+                       if p != "" {
+                               *e = strings.Split(p, ",")
+                       } else {
+                               *e = []string{}
+                       }
+               default:
+                       // Someone goof'd when writing code using this function. Scream so they can hear us.
+                       panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
+               }
+       }
+}
+
+func ParsePasswdFile(path string) ([]User, error) {
+       passwd, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer passwd.Close()
+       return ParsePasswd(passwd)
+}
+
+func ParsePasswd(passwd io.Reader) ([]User, error) {
+       return ParsePasswdFilter(passwd, nil)
+}
+
+func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
+       passwd, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer passwd.Close()
+       return ParsePasswdFilter(passwd, filter)
+}
+
+func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
+       if r == nil {
+               return nil, fmt.Errorf("nil source for passwd-formatted data")
+       }
+
+       var (
+               s   = bufio.NewScanner(r)
+               out = []User{}
+       )
+
+       for s.Scan() {
+               if err := s.Err(); err != nil {
+                       return nil, err
+               }
+
+               line := strings.TrimSpace(s.Text())
+               if line == "" {
+                       continue
+               }
+
+               // see: man 5 passwd
+               //  name:password:UID:GID:GECOS:directory:shell
+               // Name:Pass:Uid:Gid:Gecos:Home:Shell
+               //  root:x:0:0:root:/root:/bin/bash
+               //  adm:x:3:4:adm:/var/adm:/bin/false
+               p := User{}
+               parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
+
+               if filter == nil || filter(p) {
+                       out = append(out, p)
+               }
+       }
+
+       return out, nil
+}
+
+func ParseGroupFile(path string) ([]Group, error) {
+       group, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+
+       defer group.Close()
+       return ParseGroup(group)
+}
+
+func ParseGroup(group io.Reader) ([]Group, error) {
+       return ParseGroupFilter(group, nil)
+}
+
+func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
+       group, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer group.Close()
+       return ParseGroupFilter(group, filter)
+}
+
+func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
+       if r == nil {
+               return nil, fmt.Errorf("nil source for group-formatted data")
+       }
+
+       var (
+               s   = bufio.NewScanner(r)
+               out = []Group{}
+       )
+
+       for s.Scan() {
+               if err := s.Err(); err != nil {
+                       return nil, err
+               }
+
+               text := s.Text()
+               if text == "" {
+                       continue
+               }
+
+               // see: man 5 group
+               //  group_name:password:GID:user_list
+               // Name:Pass:Gid:List
+               //  root:x:0:root
+               //  adm:x:4:root,adm,daemon
+               p := Group{}
+               parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)
+
+               if filter == nil || filter(p) {
+                       out = append(out, p)
+               }
+       }
+
+       return out, nil
+}
+
+type ExecUser struct {
+       Uid   int
+       Gid   int
+       Sgids []int
+       Home  string
+}
+
+// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
+// given file paths and uses that data as the arguments to GetExecUser. If the
+// files cannot be opened for any reason, the error is ignored and a nil
+// io.Reader is passed instead.
+func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
+       var passwd, group io.Reader
+
+       if passwdFile, err := os.Open(passwdPath); err == nil {
+               passwd = passwdFile
+               defer passwdFile.Close()
+       }
+
+       if groupFile, err := os.Open(groupPath); err == nil {
+               group = groupFile
+               defer groupFile.Close()
+       }
+
+       return GetExecUser(userSpec, defaults, passwd, group)
+}
+
+// GetExecUser parses a user specification string (using the passwd and group
+// readers as sources for /etc/passwd and /etc/group data, respectively). In
+// the case of blank fields or missing data from the sources, the values in
+// defaults are used.
+//
+// GetExecUser will return an error if a user or group literal could not be
+// found in any entry in passwd and group respectively.
+//
+// Examples of valid user specifications are:
+//     * ""
+//     * "user"
+//     * "uid"
+//     * "user:group"
+//     * "uid:gid"
+//     * "user:gid"
+//     * "uid:group"
+//
+// It should be noted that if you specify a numeric user or group id, they will
+// not be evaluated as usernames (only the metadata will be filled). So attempting
+// to parse a user with user.Name = "1337" will produce the user with a UID of
+// 1337.
+func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
+       if defaults == nil {
+               defaults = new(ExecUser)
+       }
+
+       // Copy over defaults.
+       user := &ExecUser{
+               Uid:   defaults.Uid,
+               Gid:   defaults.Gid,
+               Sgids: defaults.Sgids,
+               Home:  defaults.Home,
+       }
+
+       // Sgids slice *cannot* be nil.
+       if user.Sgids == nil {
+               user.Sgids = []int{}
+       }
+
+       // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
+       var userArg, groupArg string
+       parseLine(userSpec, &userArg, &groupArg)
+
+       // Convert userArg and groupArg to be numeric, so we don't have to execute
+       // Atoi *twice* for each iteration over lines.
+       uidArg, uidErr := strconv.Atoi(userArg)
+       gidArg, gidErr := strconv.Atoi(groupArg)
+
+       // Find the matching user.
+       users, err := ParsePasswdFilter(passwd, func(u User) bool {
+               if userArg == "" {
+                       // Default to current state of the user.
+                       return u.Uid == user.Uid
+               }
+
+               if uidErr == nil {
+                       // If the userArg is numeric, always treat it as a UID.
+                       return uidArg == u.Uid
+               }
+
+               return u.Name == userArg
+       })
+
+       // If we can't find the user, we have to bail.
+       if err != nil && passwd != nil {
+               if userArg == "" {
+                       userArg = strconv.Itoa(user.Uid)
+               }
+               return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
+       }
+
+       var matchedUserName string
+       if len(users) > 0 {
+               // First match wins, even if there's more than one matching entry.
+               matchedUserName = users[0].Name
+               user.Uid = users[0].Uid
+               user.Gid = users[0].Gid
+               user.Home = users[0].Home
+       } else if userArg != "" {
+               // If we can't find a user with the given username, the only other valid
+               // option is if it's a numeric username with no associated entry in passwd.
+
+               if uidErr != nil {
+                       // Not numeric.
+                       return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
+               }
+               user.Uid = uidArg
+
+               // Must be inside valid uid range.
+               if user.Uid < minId || user.Uid > maxId {
+                       return nil, ErrRange
+               }
+
+               // Okay, so it's numeric. We can just roll with this.
+       }
+
+       // On to the groups. If we matched a username, we need to do this because of
+       // the supplementary group IDs.
+       if groupArg != "" || matchedUserName != "" {
+               groups, err := ParseGroupFilter(group, func(g Group) bool {
+                       // If the group argument isn't explicit, we'll just search for it.
+                       if groupArg == "" {
+                               // Check if user is a member of this group.
+                               for _, u := range g.List {
+                                       if u == matchedUserName {
+                                               return true
+                                       }
+                               }
+                               return false
+                       }
+
+                       if gidErr == nil {
+                               // If the groupArg is numeric, always treat it as a GID.
+                               return gidArg == g.Gid
+                       }
+
+                       return g.Name == groupArg
+               })
+               if err != nil && group != nil {
+                       return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
+               }
+
+               // Only start modifying user.Gid if it is in explicit form.
+               if groupArg != "" {
+                       if len(groups) > 0 {
+                               // First match wins, even if there's more than one matching entry.
+                               user.Gid = groups[0].Gid
+                       } else {
+                               // If we can't find a group with the given name, the only other valid
+                               // option is if it's a numeric group name with no associated entry in group.
+
+                               if gidErr != nil {
+                                       // Not numeric.
+                                       return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
+                               }
+                               user.Gid = gidArg
+
+                               // Must be inside valid gid range.
+                               if user.Gid < minId || user.Gid > maxId {
+                                       return nil, ErrRange
+                               }
+
+                               // Okay, so it's numeric. We can just roll with this.
+                       }
+               } else if len(groups) > 0 {
+                       // Supplementary group ids only make sense if in the implicit form.
+                       user.Sgids = make([]int, len(groups))
+                       for i, group := range groups {
+                               user.Sgids[i] = group.Gid
+                       }
+               }
+       }
+
+       return user, nil
+}
+
+// GetAdditionalGroups looks up a list of groups by name or group id
+// against the given /etc/group formatted data. If a group name cannot
+// be found, an error will be returned. If a group id cannot be found,
+// or the given group data is nil, the id will be returned as-is
+// provided it is in the legal range.
+func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
+       var groups = []Group{}
+       if group != nil {
+               var err error
+               groups, err = ParseGroupFilter(group, func(g Group) bool {
+                       for _, ag := range additionalGroups {
+                               if g.Name == ag || strconv.Itoa(g.Gid) == ag {
+                                       return true
+                               }
+                       }
+                       return false
+               })
+               if err != nil {
+                       return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err)
+               }
+       }
+
+       gidMap := make(map[int]struct{})
+       for _, ag := range additionalGroups {
+               var found bool
+               for _, g := range groups {
+                       // if we found a matched group either by name or gid, take the
+                       // first matched as correct
+                       if g.Name == ag || strconv.Itoa(g.Gid) == ag {
+                               if _, ok := gidMap[g.Gid]; !ok {
+                                       gidMap[g.Gid] = struct{}{}
+                                       found = true
+                                       break
+                               }
+                       }
+               }
+               // we asked for a group but didn't find it. let's check to see
+               // if we wanted a numeric group
+               if !found {
+                       gid, err := strconv.Atoi(ag)
+                       if err != nil {
+                               return nil, fmt.Errorf("Unable to find group %s", ag)
+                       }
+                       // Ensure gid is inside gid range.
+                       if gid < minId || gid > maxId {
+                               return nil, ErrRange
+                       }
+                       gidMap[gid] = struct{}{}
+               }
+       }
+       gids := []int{}
+       for gid := range gidMap {
+               gids = append(gids, gid)
+       }
+       return gids, nil
+}
+
+// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
+// that opens the groupPath given and gives it as an argument to
+// GetAdditionalGroups.
+func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
+       var group io.Reader
+
+       if groupFile, err := os.Open(groupPath); err == nil {
+               group = groupFile
+               defer groupFile.Close()
+       }
+       return GetAdditionalGroups(additionalGroups, group)
+}
+
+func ParseSubIDFile(path string) ([]SubID, error) {
+       subid, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer subid.Close()
+       return ParseSubID(subid)
+}
+
+func ParseSubID(subid io.Reader) ([]SubID, error) {
+       return ParseSubIDFilter(subid, nil)
+}
+
+func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
+       subid, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer subid.Close()
+       return ParseSubIDFilter(subid, filter)
+}
+
+func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
+       if r == nil {
+               return nil, fmt.Errorf("nil source for subid-formatted data")
+       }
+
+       var (
+               s   = bufio.NewScanner(r)
+               out = []SubID{}
+       )
+
+       for s.Scan() {
+               if err := s.Err(); err != nil {
+                       return nil, err
+               }
+
+               line := strings.TrimSpace(s.Text())
+               if line == "" {
+                       continue
+               }
+
+               // see: man 5 subuid
+               p := SubID{}
+               parseLine(line, &p.Name, &p.SubID, &p.Count)
+
+               if filter == nil || filter(p) {
+                       out = append(out, p)
+               }
+       }
+
+       return out, nil
+}
+
+func ParseIDMapFile(path string) ([]IDMap, error) {
+       r, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer r.Close()
+       return ParseIDMap(r)
+}
+
+func ParseIDMap(r io.Reader) ([]IDMap, error) {
+       return ParseIDMapFilter(r, nil)
+}
+
+func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
+       r, err := os.Open(path)
+       if err != nil {
+               return nil, err
+       }
+       defer r.Close()
+       return ParseIDMapFilter(r, filter)
+}
+
+func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
+       if r == nil {
+               return nil, fmt.Errorf("nil source for idmap-formatted data")
+       }
+
+       var (
+               s   = bufio.NewScanner(r)
+               out = []IDMap{}
+       )
+
+       for s.Scan() {
+               if err := s.Err(); err != nil {
+                       return nil, err
+               }
+
+               line := strings.TrimSpace(s.Text())
+               if line == "" {
+                       continue
+               }
+
+               // see: man 7 user_namespaces
+               p := IDMap{}
+               parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)
+
+               if filter == nil || filter(p) {
+                       out = append(out, p)
+               }
+       }
+
+       return out, nil
+}
diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
new file mode 100644 (file)
index 0000000..24ee559
--- /dev/null
@@ -0,0 +1,507 @@
+package user
+
+import (
+       "io"
+       "reflect"
+       "sort"
+       "strconv"
+       "strings"
+       "testing"
+
+       "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// TestUserParseLine checks parseLine's colon-separated positional parsing
+// into string, []string and int destinations: short lines leave trailing
+// destinations untouched, excess fields are ignored, comma-separated fields
+// fill slices, and non-numeric text leaves an int destination at 0.
+func TestUserParseLine(t *testing.T) {
+	var (
+		a, b string
+		c    []string
+		d    int
+	)
+
+	// Empty input: string destinations stay empty.
+	parseLine("", &a, &b)
+	if a != "" || b != "" {
+		t.Fatalf("a and b should be empty ('%v', '%v')", a, b)
+	}
+
+	// Fewer fields than destinations: later destinations are left empty.
+	parseLine("a", &a, &b)
+	if a != "a" || b != "" {
+		t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b)
+	}
+
+	parseLine("bad boys:corny cows", &a, &b)
+	if a != "bad boys" || b != "corny cows" {
+		t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b)
+	}
+
+	// Empty input into a slice destination yields an empty slice.
+	parseLine("", &c)
+	if len(c) != 0 {
+		t.Fatalf("c should be empty (%#v)", c)
+	}
+
+	// c is written twice; the last field ("i,j,k") wins.
+	parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c)
+	if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" {
+		t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c)
+	}
+
+	parseLine("::::::::::", &a, &b, &c)
+	if a != "" || b != "" || len(c) != 0 {
+		t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c)
+	}
+
+	// Non-numeric input leaves an int destination at its zero value.
+	parseLine("not a number", &d)
+	if d != 0 {
+		t.Fatalf("d should be 0 (%v)", d)
+	}
+
+	parseLine("b:12:c", &a, &d, &b)
+	if a != "b" || b != "c" || d != 12 {
+		t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d)
+	}
+}
+
+// TestUserParsePasswd parses a small passwd fragment with a nil filter.
+// The parser is lenient: the trailing garbage line still yields a third
+// (mostly empty) entry, hence the expected count of 3.
+func TestUserParsePasswd(t *testing.T) {
+	users, err := ParsePasswdFilter(strings.NewReader(`
+root:x:0:0:root:/root:/bin/bash
+adm:x:3:4:adm:/var/adm:/bin/false
+this is just some garbage data
+`), nil)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+	if len(users) != 3 {
+		t.Fatalf("Expected 3 users, got %v", len(users))
+	}
+	if users[0].Uid != 0 || users[0].Name != "root" {
+		t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name)
+	}
+	if users[1].Uid != 3 || users[1].Name != "adm" {
+		t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name)
+	}
+}
+
+// TestUserParseGroup parses a small group fragment with a nil filter.
+// As with passwd parsing, the garbage line still produces an entry, so 3
+// groups are expected; member lists are split on commas.
+func TestUserParseGroup(t *testing.T) {
+	groups, err := ParseGroupFilter(strings.NewReader(`
+root:x:0:root
+adm:x:4:root,adm,daemon
+this is just some garbage data
+`), nil)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+	if len(groups) != 3 {
+		t.Fatalf("Expected 3 groups, got %v", len(groups))
+	}
+	if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 {
+		t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List))
+	}
+	if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 {
+		t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List))
+	}
+}
+
+// TestValidGetExecUser resolves user references in every supported form
+// ("user", "user:group", numeric uid/gid, and the empty string) against
+// in-memory passwd/group data, comparing the resulting ExecUser (uid, gid,
+// supplementary gids, home) with the expectation. Fields not pinned by the
+// reference fall back to defaultExecUser.
+func TestValidGetExecUser(t *testing.T) {
+	const passwdContent = `
+root:x:0:0:root user:/root:/bin/bash
+adm:x:42:43:adm:/var/adm:/bin/false
+111:x:222:333::/var/garbage
+odd:x:111:112::/home/odd:::::
+this is just some garbage data
+`
+	const groupContent = `
+root:x:0:root
+adm:x:43:
+grp:x:1234:root,adm
+444:x:555:111
+odd:x:444:
+this is just some garbage data
+`
+	// Fallback values used when the reference leaves a field unspecified.
+	defaultExecUser := ExecUser{
+		Uid:   8888,
+		Gid:   8888,
+		Sgids: []int{8888},
+		Home:  "/8888",
+	}
+
+	tests := []struct {
+		ref      string
+		expected ExecUser
+	}{
+		{
+			ref: "root",
+			expected: ExecUser{
+				Uid:   0,
+				Gid:   0,
+				Sgids: []int{0, 1234},
+				Home:  "/root",
+			},
+		},
+		{
+			ref: "adm",
+			expected: ExecUser{
+				Uid:   42,
+				Gid:   43,
+				Sgids: []int{1234},
+				Home:  "/var/adm",
+			},
+		},
+		{
+			ref: "root:adm",
+			expected: ExecUser{
+				Uid:   0,
+				Gid:   43,
+				Sgids: defaultExecUser.Sgids,
+				Home:  "/root",
+			},
+		},
+		{
+			ref: "adm:1234",
+			expected: ExecUser{
+				Uid:   42,
+				Gid:   1234,
+				Sgids: defaultExecUser.Sgids,
+				Home:  "/var/adm",
+			},
+		},
+		{
+			ref: "42:1234",
+			expected: ExecUser{
+				Uid:   42,
+				Gid:   1234,
+				Sgids: defaultExecUser.Sgids,
+				Home:  "/var/adm",
+			},
+		},
+		{
+			// Numeric uid not present in passwd: taken literally.
+			ref: "1337:1234",
+			expected: ExecUser{
+				Uid:   1337,
+				Gid:   1234,
+				Sgids: defaultExecUser.Sgids,
+				Home:  defaultExecUser.Home,
+			},
+		},
+		{
+			ref: "1337",
+			expected: ExecUser{
+				Uid:   1337,
+				Gid:   defaultExecUser.Gid,
+				Sgids: defaultExecUser.Sgids,
+				Home:  defaultExecUser.Home,
+			},
+		},
+		{
+			// Empty reference: everything comes from the defaults.
+			ref: "",
+			expected: ExecUser{
+				Uid:   defaultExecUser.Uid,
+				Gid:   defaultExecUser.Gid,
+				Sgids: defaultExecUser.Sgids,
+				Home:  defaultExecUser.Home,
+			},
+		},
+
+		// Regression tests for #695.
+		{
+			ref: "111",
+			expected: ExecUser{
+				Uid:   111,
+				Gid:   112,
+				Sgids: defaultExecUser.Sgids,
+				Home:  "/home/odd",
+			},
+		},
+		{
+			ref: "111:444",
+			expected: ExecUser{
+				Uid:   111,
+				Gid:   444,
+				Sgids: defaultExecUser.Sgids,
+				Home:  "/home/odd",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		// Fresh readers per case: GetExecUser consumes them.
+		passwd := strings.NewReader(passwdContent)
+		group := strings.NewReader(groupContent)
+
+		execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group)
+		if err != nil {
+			t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error())
+			t.Fail()
+			continue
+		}
+
+		if !reflect.DeepEqual(test.expected, *execUser) {
+			t.Logf("ref:      %v", test.ref)
+			t.Logf("got:      %#v", execUser)
+			t.Logf("expected: %#v", test.expected)
+			t.Fail()
+			continue
+		}
+	}
+}
+
+// TestInvalidGetExecUser verifies that GetExecUser rejects unknown
+// user/group names and negative numeric ids (including the "-42" name from
+// the deliberately broken passwd line) with an error.
+func TestInvalidGetExecUser(t *testing.T) {
+	const passwdContent = `
+root:x:0:0:root user:/root:/bin/bash
+adm:x:42:43:adm:/var/adm:/bin/false
+-42:x:12:13:broken:/very/broken
+this is just some garbage data
+`
+	const groupContent = `
+root:x:0:root
+adm:x:43:
+grp:x:1234:root,adm
+this is just some garbage data
+`
+
+	tests := []string{
+		// No such user/group.
+		"notuser",
+		"notuser:notgroup",
+		"root:notgroup",
+		"notuser:adm",
+		"8888:notgroup",
+		"notuser:8888",
+
+		// Invalid user/group values.
+		"-1:0",
+		"0:-3",
+		"-5:-2",
+		"-42",
+		"-43",
+	}
+
+	for _, test := range tests {
+		// Fresh readers per case: GetExecUser consumes them.
+		passwd := strings.NewReader(passwdContent)
+		group := strings.NewReader(groupContent)
+
+		execUser, err := GetExecUser(test, nil, passwd, group)
+		if err == nil {
+			t.Logf("got unexpected success when parsing '%s': %#v", test, execUser)
+			t.Fail()
+			continue
+		}
+	}
+}
+
+// TestGetExecUserNilSources checks GetExecUser when the passwd and/or group
+// readers are nil: resolution then falls back to the defaults for anything
+// the (missing) database would have supplied, while numeric references are
+// still honored literally.
+func TestGetExecUserNilSources(t *testing.T) {
+	const passwdContent = `
+root:x:0:0:root user:/root:/bin/bash
+adm:x:42:43:adm:/var/adm:/bin/false
+this is just some garbage data
+`
+	const groupContent = `
+root:x:0:root
+adm:x:43:
+grp:x:1234:root,adm
+this is just some garbage data
+`
+
+	defaultExecUser := ExecUser{
+		Uid:   8888,
+		Gid:   8888,
+		Sgids: []int{8888},
+		Home:  "/8888",
+	}
+
+	tests := []struct {
+		ref           string
+		// passwd/group control whether the respective reader is supplied
+		// (true) or nil (false).
+		passwd, group bool
+		expected      ExecUser
+	}{
+		{
+			ref:    "",
+			passwd: false,
+			group:  false,
+			expected: ExecUser{
+				Uid:   8888,
+				Gid:   8888,
+				Sgids: []int{8888},
+				Home:  "/8888",
+			},
+		},
+		{
+			ref:    "root",
+			passwd: true,
+			group:  false,
+			expected: ExecUser{
+				Uid:   0,
+				Gid:   0,
+				Sgids: []int{8888},
+				Home:  "/root",
+			},
+		},
+		{
+			ref:    "0",
+			passwd: false,
+			group:  false,
+			expected: ExecUser{
+				Uid:   0,
+				Gid:   8888,
+				Sgids: []int{8888},
+				Home:  "/8888",
+			},
+		},
+		{
+			ref:    "0:0",
+			passwd: false,
+			group:  false,
+			expected: ExecUser{
+				Uid:   0,
+				Gid:   0,
+				Sgids: []int{8888},
+				Home:  "/8888",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		var passwd, group io.Reader
+
+		if test.passwd {
+			passwd = strings.NewReader(passwdContent)
+		}
+
+		if test.group {
+			group = strings.NewReader(groupContent)
+		}
+
+		execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group)
+		if err != nil {
+			t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error())
+			t.Fail()
+			continue
+		}
+
+		if !reflect.DeepEqual(test.expected, *execUser) {
+			t.Logf("got:      %#v", execUser)
+			t.Logf("expected: %#v", test.expected)
+			t.Fail()
+			continue
+		}
+	}
+}
+
+func TestGetAdditionalGroups(t *testing.T) {
+       type foo struct {
+               groups   []string
+               expected []int
+               hasError bool
+       }
+
+       const groupContent = `
+root:x:0:root
+adm:x:43:
+grp:x:1234:root,adm
+adm:x:4343:root,adm-duplicate
+this is just some garbage data
+`
+       tests := []foo{
+               {
+                       // empty group
+                       groups:   []string{},
+                       expected: []int{},
+               },
+               {
+                       // single group
+                       groups:   []string{"adm"},
+                       expected: []int{43},
+               },
+               {
+                       // multiple groups
+                       groups:   []string{"adm", "grp"},
+                       expected: []int{43, 1234},
+               },
+               {
+                       // invalid group
+                       groups:   []string{"adm", "grp", "not-exist"},
+                       expected: nil,
+                       hasError: true,
+               },
+               {
+                       // group with numeric id
+                       groups:   []string{"43"},
+                       expected: []int{43},
+               },
+               {
+                       // group with unknown numeric id
+                       groups:   []string{"adm", "10001"},
+                       expected: []int{43, 10001},
+               },
+               {
+                       // groups specified twice with numeric and name
+                       groups:   []string{"adm", "43"},
+                       expected: []int{43},
+               },
+               {
+                       // groups with too small id
+                       groups:   []string{"-1"},
+                       expected: nil,
+                       hasError: true,
+               },
+       }
+
+       if utils.GetIntSize() > 4 {
+               tests = append(tests, foo{
+                       // groups with too large id
+                       groups:   []string{strconv.Itoa(1 << 31)},
+                       expected: nil,
+                       hasError: true,
+               })
+       }
+
+       for _, test := range tests {
+               group := strings.NewReader(groupContent)
+
+               gids, err := GetAdditionalGroups(test.groups, group)
+               if test.hasError && err == nil {
+                       t.Errorf("Parse(%#v) expects error but has none", test)
+                       continue
+               }
+               if !test.hasError && err != nil {
+                       t.Errorf("Parse(%#v) has error %v", test, err)
+                       continue
+               }
+               sort.Sort(sort.IntSlice(gids))
+               if !reflect.DeepEqual(gids, test.expected) {
+                       t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups)
+               }
+       }
+}
+
+func TestGetAdditionalGroupsNumeric(t *testing.T) {
+       tests := []struct {
+               groups   []string
+               expected []int
+               hasError bool
+       }{
+               {
+                       // numeric groups only
+                       groups:   []string{"1234", "5678"},
+                       expected: []int{1234, 5678},
+               },
+               {
+                       // numeric and alphabetic
+                       groups:   []string{"1234", "fake"},
+                       expected: nil,
+                       hasError: true,
+               },
+       }
+
+       for _, test := range tests {
+               gids, err := GetAdditionalGroups(test.groups, nil)
+               if test.hasError && err == nil {
+                       t.Errorf("Parse(%#v) expects error but has none", test)
+                       continue
+               }
+               if !test.hasError && err != nil {
+                       t.Errorf("Parse(%#v) has error %v", test, err)
+                       continue
+               }
+               sort.Sort(sort.IntSlice(gids))
+               if !reflect.DeepEqual(gids, test.expected) {
+                       t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups)
+               }
+       }
+}
diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go
new file mode 100644 (file)
index 0000000..c8a9364
--- /dev/null
@@ -0,0 +1,93 @@
+// +build linux
+
+package utils
+
+/*
+ * Copyright 2016, 2017 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import (
+       "fmt"
+       "os"
+
+       "golang.org/x/sys/unix"
+)
+
+// MaxNameLen is the maximum length of the name of a file descriptor being
+// sent using SendFd. The name of the file handle returned by RecvFd will
+// never be larger than this value.
+const MaxNameLen = 4096
+
+// oobSpace is the size of the oob slice required to store a single FD. Note
+// that unix.UnixRights appears to make the assumption that fd is always int32,
+// so sizeof(fd) = 4.
+var oobSpace = unix.CmsgSpace(4)
+
+// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
+// socket. The file name of the remote file descriptor will be recreated
+// locally (it is sent as non-auxiliary data in the same payload).
+func RecvFd(socket *os.File) (*os.File, error) {
+	// For some reason, unix.Recvmsg uses the length rather than the capacity
+	// when passing the msg_controllen and other attributes to recvmsg.  So we
+	// have to actually set the length.
+	name := make([]byte, MaxNameLen)
+	oob := make([]byte, oobSpace)
+
+	sockfd := socket.Fd()
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	// n == MaxNameLen would mean the name may have been truncated (SendFd
+	// enforces len(name) < MaxNameLen); oobn must match exactly one
+	// single-fd control message.
+	if n >= MaxNameLen || oobn != oobSpace {
+		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+	}
+
+	// Truncate.
+	name = name[:n]
+	oob = oob[:oobn]
+
+	scms, err := unix.ParseSocketControlMessage(oob)
+	if err != nil {
+		return nil, err
+	}
+	// Exactly one SCM_RIGHTS message carrying exactly one fd is expected.
+	if len(scms) != 1 {
+		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+	}
+	scm := scms[0]
+
+	fds, err := unix.ParseUnixRights(&scm)
+	if err != nil {
+		return nil, err
+	}
+	if len(fds) != 1 {
+		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
+	}
+	fd := uintptr(fds[0])
+
+	// The received payload bytes become the os.File's name.
+	return os.NewFile(fd, string(name)), nil
+}
+
+// SendFd sends a file descriptor over the given AF_UNIX socket. In
+// addition, the file.Name() of the given file will also be sent as
+// non-auxiliary data in the same payload (allowing to send contextual
+// information for a file descriptor).
+func SendFd(socket *os.File, name string, fd uintptr) error {
+	// The name must be strictly shorter than MaxNameLen so that RecvFd can
+	// distinguish a maximum-length name from kernel truncation.
+	if len(name) >= MaxNameLen {
+		return fmt.Errorf("sendfd: filename too long: %s", name)
+	}
+	oob := unix.UnixRights(int(fd))
+	return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0)
+}
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
new file mode 100644 (file)
index 0000000..40ccfaa
--- /dev/null
@@ -0,0 +1,112 @@
+package utils
+
+import (
+       "encoding/json"
+       "io"
+       "os"
+       "path/filepath"
+       "strings"
+       "unsafe"
+
+       "golang.org/x/sys/unix"
+)
+
+const (
+	// exitSignalOffset is added to a terminating signal's number to form
+	// the conventional shell-style exit status (e.g. 128+2=130 for SIGINT).
+	exitSignalOffset = 128
+)
+
// ResolveRootfs converts the possibly-relative rootfs path into an absolute
// one and resolves any symlinks in it, so later code works with the real
// on-disk location.
func ResolveRootfs(uncleanRootfs string) (string, error) {
	abs, err := filepath.Abs(uncleanRootfs)
	if err != nil {
		return "", err
	}
	return filepath.EvalSymlinks(abs)
}
+
+// ExitStatus returns the correct exit status for a process based on if it
+// was signaled or exited cleanly
+func ExitStatus(status unix.WaitStatus) int {
+       if status.Signaled() {
+               return exitSignalOffset + int(status.Signal())
+       }
+       return status.ExitStatus()
+}
+
// WriteJSON marshals v with the standard json encoder and writes the
// resulting bytes to w, returning the first error encountered.
func WriteJSON(w io.Writer, v interface{}) error {
	payload, err := json.Marshal(v)
	if err == nil {
		_, err = w.Write(payload)
	}
	return err
}
+
// CleanPath makes a path safe for use with filepath.Join. A relative path
// is first anchored at "/" and cleaned (swallowing any leading ".."
// components) before being made relative again, so the result can never
// lexically escape a directory it is later joined onto. This is purely
// lexical: paths containing symlinks are not made safe by CleanPath.
func CleanPath(path string) string {
	// Preserve the empty string rather than turning it into ".".
	if path == "" {
		return ""
	}

	// Normalize away troublesome sequences such as "/../../../../../".
	cleaned := filepath.Clean(path)
	if filepath.IsAbs(cleaned) {
		return cleaned
	}

	// Anchor at the root so leading ".." components collapse, then strip
	// the root again. filepath.Rel cannot fail here: both arguments are
	// rooted at the separator.
	sep := string(os.PathSeparator)
	anchored := filepath.Clean(sep + cleaned)
	rel, _ := filepath.Rel(sep, anchored)
	return filepath.Clean(rel)
}
+
// SearchLabels scans a list of "key=value" strings and returns the value of
// the first entry whose key equals query. Entries without a '=' are
// skipped; a missing key yields the empty string.
func SearchLabels(labels []string, query string) string {
	for _, label := range labels {
		kv := strings.SplitN(label, "=", 2)
		if len(kv) != 2 {
			continue
		}
		if kv[0] == query {
			return kv[1]
		}
	}
	return ""
}
+
// Annotations splits libcontainer state labels ("key=value" strings) into
// the bundle path and the remaining user-defined annotations. The "bundle"
// key is extracted separately because libcontainer adds it itself; entries
// without a '=' are ignored.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = make(map[string]string)
	for _, label := range labels {
		kv := strings.SplitN(label, "=", 2)
		if len(kv) != 2 {
			continue
		}
		key, value := kv[0], kv[1]
		if key == "bundle" {
			bundle = value
			continue
		}
		userAnnotations[key] = value
	}
	return bundle, userAnnotations
}
+
+// GetIntSize returns the size in bytes of the native int type (4 on 32-bit
+// platforms, 8 on 64-bit platforms).
+func GetIntSize() int {
+	return int(unsafe.Sizeof(1))
+}
diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go
new file mode 100644 (file)
index 0000000..395eedc
--- /dev/null
@@ -0,0 +1,142 @@
+package utils
+
+import (
+       "bytes"
+       "fmt"
+       "os"
+       "path/filepath"
+       "testing"
+
+       "golang.org/x/sys/unix"
+)
+
+// labelTest enumerates SearchLabels cases: entries without '=' are skipped,
+// the first matching key wins, and a missing key yields "".
+var labelTest = []struct {
+	labels        []string
+	query         string
+	expectedValue string
+}{
+	{[]string{"bundle=/path/to/bundle"}, "bundle", "/path/to/bundle"},
+	{[]string{"test=a", "test=b"}, "bundle", ""},
+	{[]string{"bundle=a", "test=b", "bundle=c"}, "bundle", "a"},
+	{[]string{"", "test=a", "bundle=b"}, "bundle", "b"},
+	{[]string{"test", "bundle=a"}, "bundle", "a"},
+	{[]string{"test=a", "bundle="}, "bundle", ""},
+}
+
+func TestSearchLabels(t *testing.T) {
+       for _, tt := range labelTest {
+               if v := SearchLabels(tt.labels, tt.query); v != tt.expectedValue {
+                       t.Errorf("expected value '%s' for query '%s'; got '%s'", tt.expectedValue, tt.query, v)
+               }
+       }
+}
+
+func TestResolveRootfs(t *testing.T) {
+       dir := "rootfs"
+       os.Mkdir(dir, 0600)
+       defer os.Remove(dir)
+
+       path, err := ResolveRootfs(dir)
+       if err != nil {
+               t.Fatal(err)
+       }
+       pwd, err := os.Getwd()
+       if err != nil {
+               t.Fatal(err)
+       }
+       if path != fmt.Sprintf("%s/%s", pwd, "rootfs") {
+               t.Errorf("expected rootfs to be abs and was %s", path)
+       }
+}
+
+func TestResolveRootfsWithSymlink(t *testing.T) {
+       dir := "rootfs"
+       tmpDir, _ := filepath.EvalSymlinks(os.TempDir())
+       os.Symlink(tmpDir, dir)
+       defer os.Remove(dir)
+
+       path, err := ResolveRootfs(dir)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       if path != tmpDir {
+               t.Errorf("expected rootfs to be the real path %s and was %s", path, os.TempDir())
+       }
+}
+
+func TestResolveRootfsWithNonExistingDir(t *testing.T) {
+       _, err := ResolveRootfs("foo")
+       if err == nil {
+               t.Error("expected error to happen but received nil")
+       }
+}
+
+// TestExitStatus checks that a clean exit (wait status 0) maps to exit
+// code 0.
+func TestExitStatus(t *testing.T) {
+	status := unix.WaitStatus(0)
+	ex := ExitStatus(status)
+	if ex != 0 {
+		t.Errorf("expected exit status to equal 0 and received %d", ex)
+	}
+}
+
+// TestExitStatusSignaled checks the 128+signal convention: a process
+// terminated by signal 2 (SIGINT) reports exit status 130.
+func TestExitStatusSignaled(t *testing.T) {
+	status := unix.WaitStatus(2)
+	ex := ExitStatus(status)
+	if ex != 130 {
+		t.Errorf("expected exit status to equal 130 and received %d", ex)
+	}
+}
+
+// TestWriteJSON marshals a simple struct into a buffer and compares the
+// exact JSON bytes written.
+func TestWriteJSON(t *testing.T) {
+	person := struct {
+		Name string
+		Age  int
+	}{
+		Name: "Alice",
+		Age:  30,
+	}
+
+	var b bytes.Buffer
+	err := WriteJSON(&b, person)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expected := `{"Name":"Alice","Age":30}`
+	if b.String() != expected {
+		t.Errorf("expected to write %s but was %s", expected, b.String())
+	}
+}
+
+func TestCleanPath(t *testing.T) {
+       path := CleanPath("")
+       if path != "" {
+               t.Errorf("expected to receive empty string and received %s", path)
+       }
+
+       path = CleanPath("rootfs")
+       if path != "rootfs" {
+               t.Errorf("expected to receive 'rootfs' and received %s", path)
+       }
+
+       path = CleanPath("../../../var")
+       if path != "var" {
+               t.Errorf("expected to receive 'var' and received %s", path)
+       }
+
+       path = CleanPath("/../../../var")
+       if path != "/var" {
+               t.Errorf("expected to receive '/var' and received %s", path)
+       }
+
+       path = CleanPath("/foo/bar/")
+       if path != "/foo/bar" {
+               t.Errorf("expected to receive '/foo/bar' and received %s", path)
+       }
+
+       path = CleanPath("/foo/bar/../")
+       if path != "/foo" {
+               t.Errorf("expected to receive '/foo' and received %s", path)
+       }
+}
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
new file mode 100644 (file)
index 0000000..1576f2d
--- /dev/null
@@ -0,0 +1,68 @@
+// +build !windows
+
+package utils
+
+import (
+       "fmt"
+       "os"
+       "strconv"
+
+       "golang.org/x/sys/unix"
+)
+
+// EnsureProcHandle returns an error if the given file handle is not on
+// procfs (determined by fstatfs reporting PROC_SUPER_MAGIC), and nil if it
+// is. Note: the original comment said "returns whether or not", but the
+// function reports via error, not bool.
+func EnsureProcHandle(fh *os.File) error {
+	var buf unix.Statfs_t
+	if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
+		return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err)
+	}
+	if buf.Type != unix.PROC_SUPER_MAGIC {
+		return fmt.Errorf("%s is not on procfs", fh.Name())
+	}
+	return nil
+}
+
+// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
+// the process (except for those below the given fd value).
+func CloseExecFrom(minFd int) error {
+	fdDir, err := os.Open("/proc/self/fd")
+	if err != nil {
+		return err
+	}
+	defer fdDir.Close()
+
+	// Refuse to proceed if /proc/self/fd is not actually procfs (e.g. if it
+	// has been masked by another mount).
+	if err := EnsureProcHandle(fdDir); err != nil {
+		return err
+	}
+
+	// Each entry name in /proc/self/fd is the number of an open descriptor.
+	fdList, err := fdDir.Readdirnames(-1)
+	if err != nil {
+		return err
+	}
+	for _, fdStr := range fdList {
+		fd, err := strconv.Atoi(fdStr)
+		// Ignore non-numeric file names.
+		if err != nil {
+			continue
+		}
+		// Ignore descriptors lower than our specified minimum.
+		if fd < minFd {
+			continue
+		}
+		// Intentionally ignore errors from unix.CloseOnExec -- the cases where
+		// this might fail are basically file descriptors that have already
+		// been closed (including and especially the descriptor opened above
+		// for the Readdirnames call itself).
+		unix.CloseOnExec(fd)
+	}
+	return nil
+}
+
+// NewSockPair returns a connected pair of AF_LOCAL stream sockets with
+// SOCK_CLOEXEC set, wrapped as *os.File values named name+"-p" (parent end)
+// and name+"-c" (child end).
+func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, nil, err
+	}
+	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
+}
diff --git a/list.go b/list.go
new file mode 100644 (file)
index 0000000..0313d8c
--- /dev/null
+++ b/list.go
@@ -0,0 +1,175 @@
+// +build linux
+
+package main
+
+import (
+       "fmt"
+       "io/ioutil"
+       "os"
+       "path/filepath"
+       "syscall"
+       "text/tabwriter"
+       "time"
+
+       "encoding/json"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/user"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/urfave/cli"
+)
+
+const formatOptions = `table or json`
+
+// containerState represents the platform agnostic pieces relating to a
+// running container's status and state
+type containerState struct {
+       // Version is the OCI version for the container
+       Version string `json:"ociVersion"`
+       // ID is the container ID
+       ID string `json:"id"`
+       // InitProcessPid is the init process id in the parent namespace
+       InitProcessPid int `json:"pid"`
+       // Status is the current status of the container, running, paused, ...
+       Status string `json:"status"`
+       // Bundle is the path on the filesystem to the bundle
+       Bundle string `json:"bundle"`
+       // Rootfs is a path to a directory containing the container's root filesystem.
+       Rootfs string `json:"rootfs"`
+       // Created is the unix timestamp for the creation time of the container in UTC
+       Created time.Time `json:"created"`
+       // Annotations is the user defined annotations added to the config.
+       Annotations map[string]string `json:"annotations,omitempty"`
+       // The owner of the state directory (the owner of the container).
+       Owner string `json:"owner"`
+}
+
+var listCommand = cli.Command{
+       Name:  "list",
+       Usage: "lists containers started by runc with the given root",
+       ArgsUsage: `
+
+Where the given root is specified via the global option "--root"
+(default: "/run/runc").
+
+EXAMPLE 1:
+To list containers created via the default "--root":
+       # runc list
+
+EXAMPLE 2:
+To list containers created using a non-default value for "--root":
+       # runc --root value list`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "format, f",
+                       Value: "table",
+                       Usage: `select one of: ` + formatOptions,
+               },
+               cli.BoolFlag{
+                       Name:  "quiet, q",
+                       Usage: "display only container IDs",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 0, exactArgs); err != nil {
+                       return err
+               }
+               s, err := getContainers(context)
+               if err != nil {
+                       return err
+               }
+
+               if context.Bool("quiet") {
+                       for _, item := range s {
+                               fmt.Println(item.ID)
+                       }
+                       return nil
+               }
+
+               switch context.String("format") {
+               case "table":
+                       w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+                       fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+                       for _, item := range s {
+                               fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n",
+                                       item.ID,
+                                       item.InitProcessPid,
+                                       item.Status,
+                                       item.Bundle,
+                                       item.Created.Format(time.RFC3339Nano),
+                                       item.Owner)
+                       }
+                       if err := w.Flush(); err != nil {
+                               return err
+                       }
+               case "json":
+                       if err := json.NewEncoder(os.Stdout).Encode(s); err != nil {
+                               return err
+                       }
+               default:
+                       return fmt.Errorf("invalid format option")
+               }
+               return nil
+       },
+}
+
+func getContainers(context *cli.Context) ([]containerState, error) {
+       factory, err := loadFactory(context)
+       if err != nil {
+               return nil, err
+       }
+       root := context.GlobalString("root")
+       absRoot, err := filepath.Abs(root)
+       if err != nil {
+               return nil, err
+       }
+       list, err := ioutil.ReadDir(absRoot)
+       if err != nil {
+               fatal(err)
+       }
+
+       var s []containerState
+       for _, item := range list {
+               if item.IsDir() {
+                       // This cast is safe on Linux.
+                       stat := item.Sys().(*syscall.Stat_t)
+                       owner, err := user.LookupUid(int(stat.Uid))
+                       if err != nil {
+                               owner.Name = fmt.Sprintf("#%d", stat.Uid)
+                       }
+
+                       container, err := factory.Load(item.Name())
+                       if err != nil {
+                               fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err)
+                               continue
+                       }
+                       containerStatus, err := container.Status()
+                       if err != nil {
+                               fmt.Fprintf(os.Stderr, "status for %s: %v\n", item.Name(), err)
+                               continue
+                       }
+                       state, err := container.State()
+                       if err != nil {
+                               fmt.Fprintf(os.Stderr, "state for %s: %v\n", item.Name(), err)
+                               continue
+                       }
+                       pid := state.BaseState.InitProcessPid
+                       if containerStatus == libcontainer.Stopped {
+                               pid = 0
+                       }
+                       bundle, annotations := utils.Annotations(state.Config.Labels)
+                       s = append(s, containerState{
+                               Version:        state.BaseState.Config.Version,
+                               ID:             state.BaseState.ID,
+                               InitProcessPid: pid,
+                               Status:         containerStatus.String(),
+                               Bundle:         bundle,
+                               Rootfs:         state.BaseState.Config.Rootfs,
+                               Created:        state.BaseState.Created,
+                               Annotations:    annotations,
+                               Owner:          owner.Name,
+                       })
+               }
+       }
+       return s, nil
+}
diff --git a/main.go b/main.go
new file mode 100644 (file)
index 0000000..3a8c163
--- /dev/null
+++ b/main.go
@@ -0,0 +1,176 @@
+package main
+
+import (
+       "fmt"
+       "io"
+       "os"
+       "strings"
+
+       "github.com/opencontainers/runc/libcontainer/logs"
+
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+// version will be populated by the Makefile, read from
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile
+var gitCommit = ""
+
+const (
+       specConfig = "config.json"
+       usage      = `Open Container Initiative runtime
+
+runc is a command line client for running applications packaged according to
+the Open Container Initiative (OCI) format and is a compliant implementation of the
+Open Container Initiative specification.
+
+runc integrates well with existing process supervisors to provide a production
+container runtime environment for applications. It can be used with your
+existing process monitoring tools and the container will be spawned as a
+direct child of the process supervisor.
+
+Containers are configured using bundles. A bundle for a container is a directory
+that includes a specification file named "` + specConfig + `" and a root filesystem.
+The root filesystem contains the contents of the container.
+
+To start a new instance of a container:
+
+    # runc run [ -b bundle ] <container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host. Providing the bundle directory using "-b" is optional. The default
+value for "bundle" is the current directory.`
+)
+
+func main() {
+       app := cli.NewApp()
+       app.Name = "runc"
+       app.Usage = usage
+
+       var v []string
+       if version != "" {
+               v = append(v, version)
+       }
+       if gitCommit != "" {
+               v = append(v, fmt.Sprintf("commit: %s", gitCommit))
+       }
+       v = append(v, fmt.Sprintf("spec: %s", specs.Version))
+       app.Version = strings.Join(v, "\n")
+
+       root := "/run/runc"
+       if shouldHonorXDGRuntimeDir() {
+               if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
+                       root = runtimeDir + "/runc"
+                       // According to the XDG specification, we need to set anything in
+                       // XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get
+                       // auto-pruned.
+                       if err := os.MkdirAll(root, 0700); err != nil {
+                               fatal(err)
+                       }
+                       if err := os.Chmod(root, 0700|os.ModeSticky); err != nil {
+                               fatal(err)
+                       }
+               }
+       }
+
+       app.Flags = []cli.Flag{
+               cli.BoolFlag{
+                       Name:  "debug",
+                       Usage: "enable debug output for logging",
+               },
+               cli.StringFlag{
+                       Name:  "log",
+                       Value: "",
+                       Usage: "set the log file path where internal debug information is written",
+               },
+               cli.StringFlag{
+                       Name:  "log-format",
+                       Value: "text",
+                       Usage: "set the format used by logs ('text' (default), or 'json')",
+               },
+               cli.StringFlag{
+                       Name:  "root",
+                       Value: root,
+                       Usage: "root directory for storage of container state (this should be located in tmpfs)",
+               },
+               cli.StringFlag{
+                       Name:  "criu",
+                       Value: "criu",
+                       Usage: "path to the criu binary used for checkpoint and restore",
+               },
+               cli.BoolFlag{
+                       Name:  "systemd-cgroup",
+                       Usage: "enable systemd cgroup support, expects cgroupsPath to be of form \"slice:prefix:name\" for e.g. \"system.slice:runc:434234\"",
+               },
+               cli.StringFlag{
+                       Name:  "rootless",
+                       Value: "auto",
+                       Usage: "ignore cgroup permission errors ('true', 'false', or 'auto')",
+               },
+       }
+       app.Commands = []cli.Command{
+               checkpointCommand,
+               createCommand,
+               deleteCommand,
+               eventsCommand,
+               execCommand,
+               initCommand,
+               killCommand,
+               listCommand,
+               pauseCommand,
+               psCommand,
+               restoreCommand,
+               resumeCommand,
+               runCommand,
+               specCommand,
+               startCommand,
+               stateCommand,
+               updateCommand,
+       }
+       app.Before = func(context *cli.Context) error {
+               return logs.ConfigureLogging(createLogConfig(context))
+       }
+
+       // If the command returns an error, cli takes upon itself to print
+       // the error on cli.ErrWriter and exit.
+       // Use our own writer here to ensure the log gets sent to the right location.
+       cli.ErrWriter = &FatalWriter{cli.ErrWriter}
+       if err := app.Run(os.Args); err != nil {
+               fatal(err)
+       }
+}
+
+type FatalWriter struct {
+       cliErrWriter io.Writer
+}
+
+func (f *FatalWriter) Write(p []byte) (n int, err error) {
+       logrus.Error(string(p))
+       return f.cliErrWriter.Write(p)
+}
+
+func createLogConfig(context *cli.Context) logs.Config {
+       logFilePath := context.GlobalString("log")
+       logPipeFd := ""
+       if logFilePath == "" {
+               logPipeFd = "2"
+       }
+       config := logs.Config{
+               LogPipeFd:   logPipeFd,
+               LogLevel:    logrus.InfoLevel,
+               LogFilePath: logFilePath,
+               LogFormat:   context.GlobalString("log-format"),
+       }
+       if context.GlobalBool("debug") {
+               config.LogLevel = logrus.DebugLevel
+       }
+
+       return config
+}
diff --git a/man/README.md b/man/README.md
new file mode 100644 (file)
index 0000000..1d7a54f
--- /dev/null
@@ -0,0 +1,11 @@
+runc man pages
+====================
+
+This directory contains man pages for runc in markdown format.
+
+To generate man pages from it, use this command
+
+    ./md2man-all.sh
+
+You will see man pages generated under the man8 directory.
+
diff --git a/man/md2man-all.sh b/man/md2man-all.sh
new file mode 100755 (executable)
index 0000000..f850ddf
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/bash
+set -e
+
+# get into this script's directory
+cd "$(dirname "$(readlink -f "$BASH_SOURCE")")"
+
+[ "$1" = '-q' ] || {
+       set -x
+       pwd
+}
+
+if ! ( which go-md2man &>/dev/null ); then
+       echo "To install man pages, please install 'go-md2man'."
+       exit 0
+fi
+
+for FILE in *.md; do
+       base="$(basename "$FILE")"
+       name="${base%.md}"
+       num="${name##*.}"
+       if [ -z "$num" -o "$name" = "$num" ]; then
+               # skip files that aren't of the format xxxx.N.md (like README.md)
+               continue
+       fi
+       mkdir -p "./man${num}"
+       go-md2man -in "$FILE" -out "./man${num}/${name}"
+done
diff --git a/man/runc-checkpoint.8.md b/man/runc-checkpoint.8.md
new file mode 100644 (file)
index 0000000..08e6b1f
--- /dev/null
@@ -0,0 +1,30 @@
+% runc-checkpoint "8"
+
+# NAME
+   runc checkpoint - checkpoint a running container
+
+# SYNOPSIS
+   runc checkpoint [command options] `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container to be
+checkpointed.
+
+# DESCRIPTION
+   The checkpoint command saves the state of the container instance.
+
+# OPTIONS
+    --image-path value           path for saving criu image files
+    --work-path value            path for saving work files and logs
+    --parent-path value          path for previous criu image files in pre-dump
+    --leave-running              leave the process running after checkpointing
+    --tcp-established            allow open tcp connections
+    --ext-unix-sk                allow external unix sockets
+    --shell-job                  allow shell jobs
+    --lazy-pages                 use userfaultfd to lazily restore memory pages
+    --status-fd value            criu writes \0 to this FD once lazy-pages is ready
+    --page-server value          ADDRESS:PORT of the page server
+    --file-locks                 handle file locks, for safety
+    --pre-dump                   dump container's memory information only, leave the container running after this
+    --manage-cgroups-mode value  cgroups mode: 'soft' (default), 'full' and 'strict'
+    --empty-ns value             create a namespace, but don't restore its properties
+    --auto-dedup                 enable auto deduplication of memory images
diff --git a/man/runc-create.8.md b/man/runc-create.8.md
new file mode 100644 (file)
index 0000000..99c0a2c
--- /dev/null
@@ -0,0 +1,29 @@
+% runc-create "8"
+
+# NAME
+   runc create - create a container
+
+# SYNOPSIS
+   runc create [command options] `<container-id>`
+
+Where "`<container-id>`" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+   The create command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "config.json" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.
+
+# OPTIONS
+    --bundle value, -b value  path to the root of the bundle directory, defaults to the current directory
+    --console-socket value    path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal
+    --pid-file value          specify the file to write the process id to
+    --no-pivot                do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk
+    --no-new-keyring          do not create a new session keyring for the container.  This will cause the container to inherit the calling process's session key
+    --preserve-fds value      Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)
diff --git a/man/runc-delete.8.md b/man/runc-delete.8.md
new file mode 100644 (file)
index 0000000..84922a0
--- /dev/null
@@ -0,0 +1,19 @@
+% runc-delete "8"
+
+# NAME
+   runc delete - delete any resources held by the container often used with detached container
+
+# SYNOPSIS
+   runc delete [command options] `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container.
+
+# OPTIONS
+    --force, -f                Forcibly deletes the container if it is still running (uses SIGKILL)
+
+# EXAMPLE
+For example, if the container id is "ubuntu01" and runc list currently shows the
+status of "ubuntu01" as "stopped" the following will delete resources held for
+"ubuntu01" removing "ubuntu01" from the runc list of containers:  
+
+       # runc delete ubuntu01
diff --git a/man/runc-events.8.md b/man/runc-events.8.md
new file mode 100644 (file)
index 0000000..d998a38
--- /dev/null
@@ -0,0 +1,17 @@
+% runc-events "8"
+
+# NAME
+   runc events - display container events such as OOM notifications, cpu, memory, and IO usage statistics
+
+# SYNOPSIS
+   runc events [command options] `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container.
+
+# DESCRIPTION
+   The events command displays information about the container. By default the
+information is displayed once every 5 seconds.
+
+# OPTIONS
+    --interval value     set the stats collection interval (default: 5s)
+    --stats              display the container's stats then exit
diff --git a/man/runc-exec.8.md b/man/runc-exec.8.md
new file mode 100644 (file)
index 0000000..dbaaefe
--- /dev/null
@@ -0,0 +1,33 @@
+% runc-exec "8"
+
+# NAME
+   runc exec - execute new process inside the container
+
+# SYNOPSIS
+   runc exec [command options] `<container-id>` -- `<container command>` [args...]
+
+Where "`<container-id>`" is the name for the instance of the container and
+"`<container command>`" is the command to be executed in the container.
+
+# EXAMPLE
+For example, if the container is configured to run the linux ps command the
+following will output a list of processes running in the container:
+
+       # runc exec <container-id> ps
+
+# OPTIONS
+    --console value                          specify the pty slave path for use with the container
+    --cwd value                              current working directory in the container
+    --env value, -e value                    set environment variables
+    --tty, -t                                allocate a pseudo-TTY
+    --user value, -u value                   UID (format: <uid>[:<gid>])
+    --additional-gids value, -g value        additional gids
+    --process value, -p value                path to the process.json
+    --detach, -d                             detach from the container's process
+    --pid-file value                         specify the file to write the process id to
+    --process-label value                    set the SELinux process label for the process, commonly used with selinux
+    --apparmor value                         set the apparmor profile for the process
+    --no-new-privs                           set the no new privileges value for the process
+    --cap value, -c value                    add a capability to the bounding set for the process
+    --no-subreaper                           disable the use of the subreaper used to reap reparented processes
+    --preserve-fds value                     pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)
diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md
new file mode 100644 (file)
index 0000000..1ea579a
--- /dev/null
@@ -0,0 +1,20 @@
+% runc-kill "8"
+
+# NAME
+   runc kill - kill sends the specified signal (default: SIGTERM) to the container's init process
+
+# SYNOPSIS
+   runc kill [command options] `<container-id>` `<signal>`
+
+Where "`<container-id>`" is the name for the instance of the container and
+"`<signal>`" is the signal to be sent to the init process.
+
+# OPTIONS
+    --all, -a  send the specified signal to all processes inside the container
+
+# EXAMPLE
+
+For example, if the container id is "ubuntu01" the following will send a "KILL"
+signal to the init process of the "ubuntu01" container:
+
+       # runc kill ubuntu01 KILL
diff --git a/man/runc-list.8.md b/man/runc-list.8.md
new file mode 100644 (file)
index 0000000..46cd5d0
--- /dev/null
@@ -0,0 +1,21 @@
+% runc-list "8"
+
+# NAME
+   runc list - lists containers started by runc with the given root
+
+# SYNOPSIS
+   runc list [command options]
+
+# EXAMPLE
+Where the given root is specified via the global option "--root"
+(default: "/run/runc").
+
+To list containers created via the default "--root":
+       # runc list
+
+To list containers created using a non-default value for "--root":
+       # runc --root value list
+
+# OPTIONS
+    --format value, -f value     select one of: table or json (default: "table")
+    --quiet, -q                  display only container IDs
diff --git a/man/runc-pause.8.md b/man/runc-pause.8.md
new file mode 100644 (file)
index 0000000..965f7da
--- /dev/null
@@ -0,0 +1,14 @@
+% runc-pause "8"
+
+# NAME
+   runc pause - pause suspends all processes inside the container
+
+# SYNOPSIS
+   runc pause `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container to be
+paused. 
+
+# DESCRIPTION
+   The pause command suspends all processes in the instance of the container.
+Use runc list to identify instances of containers and their current status.
diff --git a/man/runc-ps.8.md b/man/runc-ps.8.md
new file mode 100644 (file)
index 0000000..1fad467
--- /dev/null
@@ -0,0 +1,15 @@
+% runc-ps "8"
+
+# NAME
+   runc ps - ps displays the processes running inside a container
+
+# SYNOPSIS
+   runc ps [command options] `<container-id>` [ps options]
+
+# OPTIONS
+    --format value, -f value     select one of: table(default) or json
+
+The default format is table. The following will output the processes of a container
+in json format:
+
+    # runc ps -f json <container-id>
diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md
new file mode 100644 (file)
index 0000000..e475bd5
--- /dev/null
@@ -0,0 +1,28 @@
+% runc-restore "8"
+
+# NAME
+   runc restore - restore a container from a previous checkpoint
+
+# SYNOPSIS
+   runc restore [command options] `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container to be
+restored.
+
+# DESCRIPTION
+   Restores the saved state of the container instance that was previously saved
+using the runc checkpoint command.
+
+# OPTIONS
+    --image-path value           path to criu image files for restoring
+    --work-path value            path for saving work files and logs
+    --tcp-established            allow open tcp connections
+    --ext-unix-sk                allow external unix sockets
+    --shell-job                  allow shell jobs
+    --file-locks                 handle file locks, for safety
+    --manage-cgroups-mode value  cgroups mode: 'soft' (default), 'full' and 'strict'
+    --bundle value, -b value     path to the root of the bundle directory
+    --detach, -d                 detach from the container's process
+    --pid-file value             specify the file to write the process id to
+    --no-subreaper               disable the use of the subreaper used to reap reparented processes
+    --no-pivot                   do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk
diff --git a/man/runc-resume.8.md b/man/runc-resume.8.md
new file mode 100644 (file)
index 0000000..25d342f
--- /dev/null
@@ -0,0 +1,14 @@
+% runc-resume "8"
+
+# NAME
+   runc resume - resumes all processes that have been previously paused
+
+# SYNOPSIS
+   runc resume `<container-id>`
+
+Where "`<container-id>`" is the name for the instance of the container to be
+resumed.
+
+# DESCRIPTION
+   The resume command resumes all processes in the instance of the container.
+Use runc list to identify instances of containers and their current status.
diff --git a/man/runc-run.8.md b/man/runc-run.8.md
new file mode 100644 (file)
index 0000000..ad2b8b2
--- /dev/null
@@ -0,0 +1,31 @@
+% runc-run "8"
+
+# NAME
+   runc run - create and run a container
+
+# SYNOPSIS
+   runc run [command options] `<container-id>`
+
+Where "`<container-id>`" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+   The run command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "config.json" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.
+
+# OPTIONS
+    --bundle value, -b value  path to the root of the bundle directory, defaults to the current directory
+    --console-socket value    path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal
+    --detach, -d              detach from the container's process
+    --pid-file value          specify the file to write the process id to
+    --no-subreaper            disable the use of the subreaper used to reap reparented processes
+    --no-pivot                do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk
+    --no-new-keyring          do not create a new session keyring for the container.  This will cause the container to inherit the calling process's session key
+    --preserve-fds value      Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)
diff --git a/man/runc-spec.8.md b/man/runc-spec.8.md
new file mode 100644 (file)
index 0000000..6a181cd
--- /dev/null
@@ -0,0 +1,56 @@
+% runc-spec "8"
+
+# NAME
+   runc spec - create a new specification file
+
+# SYNOPSIS
+   runc spec [command options] [arguments...]
+
+# DESCRIPTION
+   The spec command creates the new specification file named "config.json" for
+the bundle.
+
+The spec generated is just a starter file. Editing of the spec is required to
+achieve desired results. For example, the newly generated spec includes an args
+parameter that is initially set to call the "sh" command when the container is
+started. Calling "sh" may work for an ubuntu container or busybox, but will not
+work for containers that do not include the "sh" program.
+
+# EXAMPLE
+  To run docker's hello-world container one needs to set the args parameter
+in the spec to call hello. This can be done using the sed command or a text
+editor. The following commands create a bundle for hello-world, change the
+default args parameter in the spec from "sh" to "/hello", then run the hello
+command in a new hello-world container named container1:
+
+    mkdir hello
+    cd hello
+    docker pull hello-world
+    docker export $(docker create hello-world) > hello-world.tar
+    mkdir rootfs
+    tar -C rootfs -xf hello-world.tar
+    runc spec
+    sed -i 's;"sh";"/hello";' config.json
+    runc start container1
+
+In the start command above, "container1" is the name for the instance of the
+container that you are starting. The name you provide for the container instance
+must be unique on your host.
+
+An alternative for generating a customized spec config is to use "oci-runtime-tool", the
+sub-command "oci-runtime-tool generate" has lots of options that can be used to do any
+customizations as you want, see [runtime-tools](https://github.com/opencontainers/runtime-tools)
+to get more information.
+
+When starting a container through runc, runc needs root privilege. If not
+already running as root, you can use sudo to give runc root privilege. For
+example: "sudo runc start container1" will give runc root privilege to start the
+container on your host.
+
+Alternatively, you can start a rootless container, which has the ability to run without root privileges.
+For this to work, the specification file needs to be adjusted accordingly.
+You can pass the parameter **--rootless** to this command to generate a proper rootless spec file.
+
+# OPTIONS
+    --bundle value, -b value     path to the root of the bundle directory
+    --rootless                   generate a configuration for a rootless container
diff --git a/man/runc-start.8.md b/man/runc-start.8.md
new file mode 100644 (file)
index 0000000..e4bbacc
--- /dev/null
@@ -0,0 +1,14 @@
+% runc-start "8"
+
+# NAME
+   runc start - start executes the user defined process in a created container
+
+# SYNOPSIS
+   runc start `<container-id>`
+
+Where "`<container-id>`" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+   The start command executes the user defined process in a created container.
diff --git a/man/runc-state.8.md b/man/runc-state.8.md
new file mode 100644 (file)
index 0000000..768f79f
--- /dev/null
@@ -0,0 +1,13 @@
+% runc-state "8"
+
+# NAME
+   runc state - output the state of a container
+
+# SYNOPSIS
+   runc state `<container-id>`
+
+Where "`<container-id>`" is your name for the instance of the container.
+
+# DESCRIPTION
+   The state command outputs current state information for the
+instance of a container.
diff --git a/man/runc-update.8.md b/man/runc-update.8.md
new file mode 100644 (file)
index 0000000..fa269d6
--- /dev/null
@@ -0,0 +1,55 @@
+% runc-update "8"
+
+# NAME
+   runc update - update container resource constraints
+
+# SYNOPSIS
+   runc update [command options] `<container-id>`
+
+# DESCRIPTION
+   The data can be read from a file or the standard input; the
+accepted format is as follows (unchanged values can be omitted):
+
+   {
+     "memory": {
+       "limit": 0,
+       "reservation": 0,
+       "swap": 0,
+       "kernel": 0,
+       "kernelTCP": 0
+     },
+     "cpu": {
+       "shares": 0,
+       "quota": 0,
+       "period": 0,
+       "realtimeRuntime": 0,
+       "realtimePeriod": 0,
+       "cpus": "",
+       "mems": ""
+     },
+     "blockIO": {
+       "blkioWeight": 0
+     }
+   }
+
+Note: if data is to be read from a file or the standard input, all
+other options are ignored.
+
+# OPTIONS
+    --resources value, -r value  path to the file containing the resources to update or '-' to read from the standard input
+    --blkio-weight value         Specifies per cgroup weight, range is from 10 to 1000 (default: 0)
+    --cpu-period value           CPU CFS period to be used for hardcapping (in usecs). 0 to use system default
+    --cpu-quota value            CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period
+    --cpu-rt-period value        CPU realtime period to be used for hardcapping (in usecs). 0 to use system default
+    --cpu-rt-runtime value       CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period
+    --cpu-share value            CPU shares (relative weight vs. other containers)
+    --cpuset-cpus value          CPU(s) to use
+    --cpuset-mems value          Memory node(s) to use
+    --kernel-memory value        Kernel memory limit (in bytes)
+    --kernel-memory-tcp value    Kernel memory limit (in bytes) for tcp buffer
+    --memory value               Memory limit (in bytes)
+    --memory-reservation value   Memory reservation or soft_limit (in bytes)
+    --memory-swap value          Total memory usage (memory + swap); set '-1' to enable unlimited swap
+    --pids-limit value           Maximum number of pids allowed in the container (default: 0)
+    --l3-cache-schema            The string of Intel RDT/CAT L3 cache schema
+    --mem-bw-schema              The string of Intel RDT/MBA memory bandwidth schema
diff --git a/man/runc.8.md b/man/runc.8.md
new file mode 100644 (file)
index 0000000..49df525
--- /dev/null
@@ -0,0 +1,61 @@
+% runc "8"
+
+# NAME
+   runc - Open Container Initiative runtime
+
+# SYNOPSIS
+   runc [global options] command [command options] [arguments...]
+   
+# DESCRIPTION
+runc is a command line client for running applications packaged according to
+the Open Container Initiative (OCI) format and is a compliant implementation of the
+Open Container Initiative specification.
+
+runc integrates well with existing process supervisors to provide a production
+container runtime environment for applications. It can be used with your
+existing process monitoring tools and the container will be spawned as a
+direct child of the process supervisor.
+
+Containers are configured using bundles. A bundle for a container is a directory
+that includes a specification file named "config.json" and a root filesystem.
+The root filesystem contains the contents of the container. 
+
+To start a new instance of a container:
+
+    # runc start [ -b bundle ] <container-id>
+
+Where "`<container-id>`" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host. Providing the bundle directory using "-b" is optional. The default
+value for "bundle" is the current directory.
+
+# COMMANDS
+    checkpoint   checkpoint a running container
+    create       create a container
+    delete       delete any resources held by the container, often used with detached containers
+    events       display container events such as OOM notifications, cpu, memory, IO and network stats
+    exec         execute new process inside the container
+    init         initialize the namespaces and launch the process (do not call it outside of runc)
+    kill         kill sends the specified signal (default: SIGTERM) to the container's init process
+    list         lists containers started by runc with the given root
+    pause        pause suspends all processes inside the container
+    ps           displays the processes running inside a container
+    restore      restore a container from a previous checkpoint
+    resume       resumes all processes that have been previously paused
+    run          create and run a container
+    spec         create a new specification file
+    start        executes the user defined process in a created container
+    state        output the state of a container
+    update       update container resource constraints
+    help, h      Shows a list of commands or help for one command
+   
+# GLOBAL OPTIONS
+    --debug              enable debug output for logging
+    --log value          set the log file path where internal debug information is written (default: "/dev/null")
+    --log-format value   set the format used by logs ('text' (default), or 'json') (default: "text")
+    --root value         root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
+    --criu value         path to the criu binary used for checkpoint and restore (default: "criu")
+    --systemd-cgroup     enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
+    --rootless value     enable rootless mode ('true', 'false', or 'auto') (default: "auto")
+    --help, -h           show help
+    --version, -v        print the version
diff --git a/notify_socket.go b/notify_socket.go
new file mode 100644 (file)
index 0000000..e7453c6
--- /dev/null
@@ -0,0 +1,116 @@
+// +build linux
+
+package main
+
+import (
+       "bytes"
+       "fmt"
+       "net"
+       "os"
+       "path/filepath"
+
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+type notifySocket struct {
+       socket     *net.UnixConn
+       host       string
+       socketPath string
+}
+
+func newNotifySocket(context *cli.Context, notifySocketHost string, id string) *notifySocket {
+       if notifySocketHost == "" {
+               return nil
+       }
+
+       root := filepath.Join(context.GlobalString("root"), id)
+       path := filepath.Join(root, "notify.sock")
+
+       notifySocket := &notifySocket{
+               socket:     nil,
+               host:       notifySocketHost,
+               socketPath: path,
+       }
+
+       return notifySocket
+}
+
+func (s *notifySocket) Close() error {
+       return s.socket.Close()
+}
+
+// If systemd is supporting sd_notify protocol, this function will add support
+// for sd_notify protocol from within the container.
+func (s *notifySocket) setupSpec(context *cli.Context, spec *specs.Spec) {
+       mount := specs.Mount{Destination: s.host, Source: s.socketPath, Options: []string{"bind"}}
+       spec.Mounts = append(spec.Mounts, mount)
+       spec.Process.Env = append(spec.Process.Env, fmt.Sprintf("NOTIFY_SOCKET=%s", s.host))
+}
+
+func (s *notifySocket) setupSocket() error {
+       addr := net.UnixAddr{
+               Name: s.socketPath,
+               Net:  "unixgram",
+       }
+
+       socket, err := net.ListenUnixgram("unixgram", &addr)
+       if err != nil {
+               return err
+       }
+
+       err = os.Chmod(s.socketPath, 0777)
+       if err != nil {
+               socket.Close()
+               return err
+       }
+
+       s.socket = socket
+       return nil
+}
+
+// pid1 must be set only with -d, as it is used to set the new process as the main process
+// for the service in systemd
+func (s *notifySocket) run(pid1 int) {
+       buf := make([]byte, 512)
+       notifySocketHostAddr := net.UnixAddr{Name: s.host, Net: "unixgram"}
+       client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
+       if err != nil {
+               logrus.Error(err)
+               return
+       }
+       for {
+               r, err := s.socket.Read(buf)
+               if err != nil {
+                       break
+               }
+               var out bytes.Buffer
+               for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
+                       if bytes.HasPrefix(line, []byte("READY=")) {
+                               _, err = out.Write(line)
+                               if err != nil {
+                                       return
+                               }
+
+                               _, err = out.Write([]byte{'\n'})
+                               if err != nil {
+                                       return
+                               }
+
+                               _, err = client.Write(out.Bytes())
+                               if err != nil {
+                                       return
+                               }
+
+                               // now we can inform systemd to use pid1 as the pid to monitor
+                               if pid1 > 0 {
+                                       newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
+                                       client.Write([]byte(newPid))
+                               }
+                               return
+                       }
+               }
+       }
+}
diff --git a/pause.go b/pause.go
new file mode 100644 (file)
index 0000000..224c79f
--- /dev/null
+++ b/pause.go
@@ -0,0 +1,66 @@
+// +build linux
+
+package main
+
+import (
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+var pauseCommand = cli.Command{
+       Name:  "pause",
+       Usage: "pause suspends all processes inside the container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+paused. `,
+       Description: `The pause command suspends all processes in the instance of the container.
+
+Use runc list to identify instances of containers and their current status.`,
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               rootlessCg, err := shouldUseRootlessCgroupManager(context)
+               if err != nil {
+                       return err
+               }
+               if rootlessCg {
+                       logrus.Warnf("runc pause may fail if you don't have the full access to cgroups")
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               return container.Pause()
+       },
+}
+
+var resumeCommand = cli.Command{
+       Name:  "resume",
+       Usage: "resumes all processes that have been previously paused",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+resumed.`,
+       Description: `The resume command resumes all processes in the instance of the container.
+
+Use runc list to identify instances of containers and their current status.`,
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               rootlessCg, err := shouldUseRootlessCgroupManager(context)
+               if err != nil {
+                       return err
+               }
+               if rootlessCg {
+                       logrus.Warn("runc resume may fail if you don't have the full access to cgroups")
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               return container.Resume()
+       },
+}
diff --git a/ps.go b/ps.go
new file mode 100644 (file)
index 0000000..e7f635f
--- /dev/null
+++ b/ps.go
@@ -0,0 +1,113 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "os/exec"
+       "strconv"
+       "strings"
+
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+var psCommand = cli.Command{
+       Name:      "ps",
+       Usage:     "ps displays the processes running inside a container",
+       ArgsUsage: `<container-id> [ps options]`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "format, f",
+                       Value: "table",
+                       Usage: `select one of: ` + formatOptions,
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, minArgs); err != nil {
+                       return err
+               }
+               rootlessCg, err := shouldUseRootlessCgroupManager(context)
+               if err != nil {
+                       return err
+               }
+               if rootlessCg {
+                       logrus.Warn("runc ps may fail if you don't have the full access to cgroups")
+               }
+
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+
+               pids, err := container.Processes()
+               if err != nil {
+                       return err
+               }
+
+               switch context.String("format") {
+               case "table":
+               case "json":
+                       return json.NewEncoder(os.Stdout).Encode(pids)
+               default:
+                       return fmt.Errorf("invalid format option")
+               }
+
+               // [1:] is to remove command name, ex:
+               // context.Args(): [container_id ps_arg1 ps_arg2 ...]
+               // psArgs:         [ps_arg1 ps_arg2 ...]
+               //
+               psArgs := context.Args()[1:]
+               if len(psArgs) == 0 {
+                       psArgs = []string{"-ef"}
+               }
+
+               cmd := exec.Command("ps", psArgs...)
+               output, err := cmd.CombinedOutput()
+               if err != nil {
+                       return fmt.Errorf("%s: %s", err, output)
+               }
+
+               lines := strings.Split(string(output), "\n")
+               pidIndex, err := getPidIndex(lines[0])
+               if err != nil {
+                       return err
+               }
+
+               fmt.Println(lines[0])
+               for _, line := range lines[1:] {
+                       if len(line) == 0 {
+                               continue
+                       }
+                       fields := strings.Fields(line)
+                       p, err := strconv.Atoi(fields[pidIndex])
+                       if err != nil {
+                               return fmt.Errorf("unexpected pid '%s': %s", fields[pidIndex], err)
+                       }
+
+                       for _, pid := range pids {
+                               if pid == p {
+                                       fmt.Println(line)
+                                       break
+                               }
+                       }
+               }
+               return nil
+       },
+       SkipArgReorder: true,
+}
+
+func getPidIndex(title string) (int, error) {
+       titles := strings.Fields(title)
+
+       pidIndex := -1
+       for i, name := range titles {
+               if name == "PID" {
+                       return i, nil
+               }
+       }
+
+       return pidIndex, fmt.Errorf("couldn't find PID field in ps output")
+}
diff --git a/restore.go b/restore.go
new file mode 100644 (file)
index 0000000..53f50d2
--- /dev/null
@@ -0,0 +1,142 @@
+// +build linux
+
+package main
+
+import (
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
+var restoreCommand = cli.Command{
+       Name:  "restore",
+       Usage: "restore a container from a previous checkpoint",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+restored.`,
+       Description: `Restores the saved state of the container instance that was previously saved
+using the runc checkpoint command.`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "console-socket",
+                       Value: "",
+                       Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+               },
+               cli.StringFlag{
+                       Name:  "image-path",
+                       Value: "",
+                       Usage: "path to criu image files for restoring",
+               },
+               cli.StringFlag{
+                       Name:  "work-path",
+                       Value: "",
+                       Usage: "path for saving work files and logs",
+               },
+               cli.BoolFlag{
+                       Name:  "tcp-established",
+                       Usage: "allow open tcp connections",
+               },
+               cli.BoolFlag{
+                       Name:  "ext-unix-sk",
+                       Usage: "allow external unix sockets",
+               },
+               cli.BoolFlag{
+                       Name:  "shell-job",
+                       Usage: "allow shell jobs",
+               },
+               cli.BoolFlag{
+                       Name:  "file-locks",
+                       Usage: "handle file locks, for safety",
+               },
+               cli.StringFlag{
+                       Name:  "manage-cgroups-mode",
+                       Value: "",
+                       Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'",
+               },
+               cli.StringFlag{
+                       Name:  "bundle, b",
+                       Value: "",
+                       Usage: "path to the root of the bundle directory",
+               },
+               cli.BoolFlag{
+                       Name:  "detach,d",
+                       Usage: "detach from the container's process",
+               },
+               cli.StringFlag{
+                       Name:  "pid-file",
+                       Value: "",
+                       Usage: "specify the file to write the process id to",
+               },
+               cli.BoolFlag{
+                       Name:  "no-subreaper",
+                       Usage: "disable the use of the subreaper used to reap reparented processes",
+               },
+               cli.BoolFlag{
+                       Name:  "no-pivot",
+                       Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+               },
+               cli.StringSliceFlag{
+                       Name:  "empty-ns",
+                       Usage: "create a namespace, but don't restore its properties",
+               },
+               cli.BoolFlag{
+                       Name:  "auto-dedup",
+                       Usage: "enable auto deduplication of memory images",
+               },
+               cli.BoolFlag{
+                       Name:  "lazy-pages",
+                       Usage: "use userfaultfd to lazily restore memory pages",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               // XXX: Currently this is untested with rootless containers.
+               if os.Geteuid() != 0 || system.RunningInUserNS() {
+                       logrus.Warn("runc checkpoint is untested with rootless containers")
+               }
+
+               spec, err := setupSpec(context)
+               if err != nil {
+                       return err
+               }
+               options := criuOptions(context)
+               if err := setEmptyNsMask(context, options); err != nil {
+                       return err
+               }
+               status, err := startContainer(context, spec, CT_ACT_RESTORE, options)
+               if err != nil {
+                       return err
+               }
+               // exit with the container's exit status so any external supervisor is
+               // notified of the exit with the correct exit status.
+               os.Exit(status)
+               return nil
+       },
+}
+
+func criuOptions(context *cli.Context) *libcontainer.CriuOpts {
+       imagePath := getCheckpointImagePath(context)
+       if err := os.MkdirAll(imagePath, 0755); err != nil {
+               fatal(err)
+       }
+       return &libcontainer.CriuOpts{
+               ImagesDirectory:         imagePath,
+               WorkDirectory:           context.String("work-path"),
+               ParentImage:             context.String("parent-path"),
+               LeaveRunning:            context.Bool("leave-running"),
+               TcpEstablished:          context.Bool("tcp-established"),
+               ExternalUnixConnections: context.Bool("ext-unix-sk"),
+               ShellJob:                context.Bool("shell-job"),
+               FileLocks:               context.Bool("file-locks"),
+               PreDump:                 context.Bool("pre-dump"),
+               AutoDedup:               context.Bool("auto-dedup"),
+               LazyPages:               context.Bool("lazy-pages"),
+               StatusFd:                context.String("status-fd"),
+       }
+}
diff --git a/rlimit_linux.go b/rlimit_linux.go
new file mode 100644 (file)
index 0000000..c97a0fb
--- /dev/null
@@ -0,0 +1,49 @@
+package main
+
+import "fmt"
+
+const (
+       RLIMIT_CPU        = iota // CPU time in sec
+       RLIMIT_FSIZE             // Maximum filesize
+       RLIMIT_DATA              // max data size
+       RLIMIT_STACK             // max stack size
+       RLIMIT_CORE              // max core file size
+       RLIMIT_RSS               // max resident set size
+       RLIMIT_NPROC             // max number of processes
+       RLIMIT_NOFILE            // max number of open files
+       RLIMIT_MEMLOCK           // max locked-in-memory address space
+       RLIMIT_AS                // address space limit
+       RLIMIT_LOCKS             // maximum file locks held
+       RLIMIT_SIGPENDING        // max number of pending signals
+       RLIMIT_MSGQUEUE          // maximum bytes in POSIX mqueues
+       RLIMIT_NICE              // max nice prio allowed to raise to
+       RLIMIT_RTPRIO            // maximum realtime priority
+       RLIMIT_RTTIME            // timeout for RT tasks in us
+)
+
+var rlimitMap = map[string]int{
+       "RLIMIT_CPU":        RLIMIT_CPU,
+       "RLIMIT_FSIZE":      RLIMIT_FSIZE,
+       "RLIMIT_DATA":       RLIMIT_DATA,
+       "RLIMIT_STACK":      RLIMIT_STACK,
+       "RLIMIT_CORE":       RLIMIT_CORE,
+       "RLIMIT_RSS":        RLIMIT_RSS,
+       "RLIMIT_NPROC":      RLIMIT_NPROC,
+       "RLIMIT_NOFILE":     RLIMIT_NOFILE,
+       "RLIMIT_MEMLOCK":    RLIMIT_MEMLOCK,
+       "RLIMIT_AS":         RLIMIT_AS,
+       "RLIMIT_LOCKS":      RLIMIT_LOCKS,
+       "RLIMIT_SIGPENDING": RLIMIT_SIGPENDING,
+       "RLIMIT_MSGQUEUE":   RLIMIT_MSGQUEUE,
+       "RLIMIT_NICE":       RLIMIT_NICE,
+       "RLIMIT_RTPRIO":     RLIMIT_RTPRIO,
+       "RLIMIT_RTTIME":     RLIMIT_RTTIME,
+}
+
+func strToRlimit(key string) (int, error) {
+       rl, ok := rlimitMap[key]
+       if !ok {
+               return 0, fmt.Errorf("wrong rlimit value: %s", key)
+       }
+       return rl, nil
+}
diff --git a/rootless_linux.go b/rootless_linux.go
new file mode 100644 (file)
index 0000000..3c425dc
--- /dev/null
@@ -0,0 +1,58 @@
+// +build linux
+
+package main
+
+import (
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/urfave/cli"
+)
+
+func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) {
+       if context != nil {
+               b, err := parseBoolOrAuto(context.GlobalString("rootless"))
+               if err != nil {
+                       return false, err
+               }
+               // nil b stands for "auto detect"
+               if b != nil {
+                       return *b, nil
+               }
+
+               if context.GlobalBool("systemd-cgroup") {
+                       return false, nil
+               }
+       }
+       if os.Geteuid() != 0 {
+               return true, nil
+       }
+       if !system.RunningInUserNS() {
+               // euid == 0 , in the initial ns (i.e. the real root)
+               return false, nil
+       }
+       // euid = 0, in a userns.
+       // As we are unaware of cgroups path, we can't determine whether we have the full
+       // access to the cgroups path.
+       // Either way, we can safely decide to use the rootless cgroups manager.
+       return true, nil
+}
+
+func shouldHonorXDGRuntimeDir() bool {
+       if os.Getenv("XDG_RUNTIME_DIR") == "" {
+               return false
+       }
+       if os.Geteuid() != 0 {
+               return true
+       }
+       if !system.RunningInUserNS() {
+               // euid == 0 , in the initial ns (i.e. the real root)
+               // in this case, we should use /run/runc and ignore
+               // $XDG_RUNTIME_DIR (e.g. /run/user/0) for backward
+               // compatibility.
+               return false
+       }
+       // euid = 0, in a userns.
+       u, ok := os.LookupEnv("USER")
+       return !ok || u != "root"
+}
diff --git a/run.go b/run.go
new file mode 100644 (file)
index 0000000..f8d6317
--- /dev/null
+++ b/run.go
@@ -0,0 +1,84 @@
+// +build linux
+
+package main
+
+import (
+       "os"
+
+       "github.com/urfave/cli"
+)
+
+// default action is to start a container
+var runCommand = cli.Command{
+       Name:  "run",
+       Usage: "create and run a container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+       Description: `The run command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "` + specConfig + `" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "bundle, b",
+                       Value: "",
+                       Usage: `path to the root of the bundle directory, defaults to the current directory`,
+               },
+               cli.StringFlag{
+                       Name:  "console-socket",
+                       Value: "",
+                       Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+               },
+               cli.BoolFlag{
+                       Name:  "detach, d",
+                       Usage: "detach from the container's process",
+               },
+               cli.StringFlag{
+                       Name:  "pid-file",
+                       Value: "",
+                       Usage: "specify the file to write the process id to",
+               },
+               cli.BoolFlag{
+                       Name:  "no-subreaper",
+                       Usage: "disable the use of the subreaper used to reap reparented processes",
+               },
+               cli.BoolFlag{
+                       Name:  "no-pivot",
+                       Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+               },
+               cli.BoolFlag{
+                       Name:  "no-new-keyring",
+                       Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
+               },
+               cli.IntFlag{
+                       Name:  "preserve-fds",
+                       Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               if err := revisePidFile(context); err != nil {
+                       return err
+               }
+               spec, err := setupSpec(context)
+               if err != nil {
+                       return err
+               }
+               status, err := startContainer(context, spec, CT_ACT_RUN, nil)
+               if err == nil {
+                       // exit with the container's exit status so any external supervisor is
+                       // notified of the exit with the correct exit status.
+                       os.Exit(status)
+               }
+               return err
+       },
+}
diff --git a/script/.validate b/script/.validate
new file mode 100644 (file)
index 0000000..170d674
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+if [ -z "$VALIDATE_UPSTREAM" ]; then
+       # this is kind of an expensive check, so let's not do this twice if we
+       # are running more than one validate bundlescript
+       
+       VALIDATE_REPO='https://github.com/opencontainers/runc.git'
+       VALIDATE_BRANCH='master'
+       
+       if [ "$TRAVIS" = 'true' -a "$TRAVIS_PULL_REQUEST" != 'false' ]; then
+               VALIDATE_REPO="https://github.com/${TRAVIS_REPO_SLUG}.git"
+               VALIDATE_BRANCH="${TRAVIS_BRANCH}"
+       fi
+       
+       VALIDATE_HEAD="$(git rev-parse --verify HEAD)"
+       
+       git fetch -q "$VALIDATE_REPO" "refs/heads/$VALIDATE_BRANCH"
+       VALIDATE_UPSTREAM="$(git rev-parse --verify FETCH_HEAD)"
+       
+       VALIDATE_COMMIT_LOG="$VALIDATE_UPSTREAM..$VALIDATE_HEAD"
+       VALIDATE_COMMIT_DIFF="$VALIDATE_UPSTREAM...$VALIDATE_HEAD"
+       
+       validate_diff() {
+               if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then
+                       git diff "$VALIDATE_COMMIT_DIFF" "$@"
+               fi
+       }
+       validate_log() {
+               if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then
+                       git log "$VALIDATE_COMMIT_LOG" "$@"
+               fi
+       }
+fi
diff --git a/script/check-config.sh b/script/check-config.sh
new file mode 100755 (executable)
index 0000000..6b8158e
--- /dev/null
@@ -0,0 +1,253 @@
+#!/usr/bin/env bash
+set -e
+
+# bits of this were adapted from check_config.sh in docker 
+# see also https://github.com/docker/docker/blob/master/contrib/check-config.sh
+
+# Candidate kernel-config locations, tried in order by search_config.
+possibleConfigs=(
+       '/proc/config.gz'
+       "/boot/config-$(uname -r)"
+       "/usr/src/linux-$(uname -r)/.config"
+       '/usr/src/linux/.config'
+)
+# Candidate file names when searching inside a user-supplied directory.
+possibleConfigFiles=(
+       'config.gz'
+       "config-$(uname -r)"
+       '.config'
+)
+
+# Minimal zgrep fallback for systems without one: zgrep PATTERN FILE.
+if ! command -v zgrep &>/dev/null; then
+       zgrep() {
+               zcat "$2" | grep "$1"
+       }
+fi
+
+# Split "MAJOR.MINOR.rest" out of `uname -r`.
+kernelVersion="$(uname -r)"
+kernelMajor="${kernelVersion%%.*}"
+kernelMinor="${kernelVersion#$kernelMajor.}"
+kernelMinor="${kernelMinor%%.*}"
+
+# True when CONFIG_$1 is =y or =m.
+# NOTE(review): "[y|m]" is a bracket expression matching 'y', '|' or 'm' —
+# the '|' is a literal, not alternation; harmless here but worth confirming.
+is_set() {
+       zgrep "CONFIG_$1=[y|m]" "$CONFIG" >/dev/null
+}
+# True only when CONFIG_$1 is built in (=y).
+is_set_in_kernel() {
+       zgrep "CONFIG_$1=y" "$CONFIG" >/dev/null
+}
+# True only when CONFIG_$1 is built as a module (=m).
+is_set_as_module() {
+       zgrep "CONFIG_$1=m" "$CONFIG" >/dev/null
+}
+
+# color [bold] NAME — emit the ANSI escape sequence for the given style.
+# An unrecognized NAME (e.g. "reset") sets no code, emitting plain '\033[m',
+# which resets all attributes.
+color() {
+       local codes=()
+       if [ "$1" = 'bold' ]; then
+               codes=("${codes[@]}" '1')
+               shift
+       fi
+       if [ "$#" -gt 0 ]; then
+               local code
+               case "$1" in
+               # see https://en.wikipedia.org/wiki/ANSI_escape_code#Colors
+               black) code=30 ;;
+               red) code=31 ;;
+               green) code=32 ;;
+               yellow) code=33 ;;
+               blue) code=34 ;;
+               magenta) code=35 ;;
+               cyan) code=36 ;;
+               white) code=37 ;;
+               esac
+               if [ "$code" ]; then
+                       codes=("${codes[@]}" "$code")
+               fi
+       fi
+       local IFS=';'
+       echo -en '\033['"${codes[*]}"'m'
+}
+# wrap_color TEXT [bold] COLOR — print TEXT in the given style, then reset.
+wrap_color() {
+       text="$1"
+       shift
+       color "$@"
+       echo -n "$text"
+       color reset
+       echo
+}
+
+# "LABEL: VALUE" with a green value (success).
+wrap_good() {
+       echo "$(wrap_color "$1" white): $(wrap_color "$2" green)"
+}
+# "LABEL: VALUE" with a bold red value (failure).
+wrap_bad() {
+       echo "$(wrap_color "$1" bold): $(wrap_color "$2" bold red)"
+}
+# Red message on stderr.
+wrap_warning() {
+       wrap_color >&2 "$*" red
+}
+
+# Report one CONFIG_* option as enabled, enabled-as-module, or missing.
+check_flag() {
+       if is_set_in_kernel "$1"; then
+               wrap_good "CONFIG_$1" 'enabled'
+       elif is_set_as_module "$1"; then
+               wrap_good "CONFIG_$1" 'enabled (as module)'
+       else
+               wrap_bad "CONFIG_$1" 'missing'
+       fi
+}
+
+# check_flag for each argument, one bullet line per flag.
+check_flags() {
+       for flag in "$@"; do
+               echo "- $(check_flag "$flag")"
+       done
+}
+
+# On RHEL7/CentOS7, user namespaces must additionally be enabled on the
+# kernel command line; warn when that boot parameter is absent.
+check_distro_userns() {
+       source /etc/os-release 2>/dev/null || /bin/true
+       if [[ "${ID}" =~ ^(centos|rhel)$ && "${VERSION_ID}" =~ ^7 ]]; then
+               # this is a CentOS7 or RHEL7 system
+               grep -q "user_namespace.enable=1" /proc/cmdline || {
+                       # no user namespace support enabled
+                       wrap_bad "  (RHEL7/CentOS7" "User namespaces disabled; add 'user_namespace.enable=1' to boot command line)"
+               }
+       fi
+}
+
+# Does $1 look like a kernel config? Currently just "is a regular file".
+is_config() {
+       local config="$1"
+
+       # Todo: more check
+       [[ -f "$config" ]] && return 0
+       return 1
+}
+
+# Set CONFIG to the first usable kernel config found, searching either the
+# given directory or the default candidate list; exits 1 if none is found.
+search_config() {
+       local target_dir="$1"
+       [[ "$target_dir" ]] || target_dir=("${possibleConfigs[@]}")
+
+       local tryConfig
+       for tryConfig in "${target_dir[@]}"; do
+               is_config "$tryConfig" && {
+                       CONFIG="$tryConfig"
+                       return
+               }
+               # Directories are searched for the known config file names.
+               [[ -d "$tryConfig" ]] && {
+                       for tryFile in "${possibleConfigFiles[@]}"; do
+                               is_config "$tryConfig/$tryFile" && {
+                                       CONFIG="$tryConfig/$tryFile"
+                                       return
+                               }
+                       done
+               }
+       done
+
+       wrap_warning "error: cannot find kernel config"
+       wrap_warning "  try running this script again, specifying the kernel config:"
+       wrap_warning "    CONFIG=/path/to/kernel/.config $0 or $0 /path/to/kernel/.config"
+       exit 1
+}
+
+CONFIG="$1"
+
+# Resolve the kernel config to inspect: an explicit file argument, a
+# directory to search within, or (with no argument) the default locations.
+is_config "$CONFIG" || {
+       if [[ ! "$CONFIG" ]]; then
+               wrap_color "info: no config specified, searching for kernel config ..." white
+               search_config
+       elif [[ -d "$CONFIG" ]]; then
+               wrap_color "info: input is a directory, searching for kernel config in this directory..." white
+               search_config "$CONFIG"
+       else
+               wrap_warning "warning: $CONFIG seems not a kernel config, searching other paths for kernel config ..."
+               search_config
+       fi
+}
+
+wrap_color "info: reading kernel config from $CONFIG ..." white
+echo
+
+echo 'Generally Necessary:'
+
+echo -n '- '
+# Find a mounted cgroup subsystem in /proc/mounts and derive the hierarchy
+# root directory from its mountpoint.
+cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)"
+cgroupDir="$(dirname "$cgroupSubsystemDir")"
+if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" -o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then
+       echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]"
+else
+       if [ "$cgroupSubsystemDir" ]; then
+               echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]"
+       else
+               echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')"
+       fi
+       echo "    $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)"
+fi
+
+# Only report on AppArmor when the kernel module says it is enabled.
+if [ "$(cat /sys/module/apparmor/parameters/enabled 2>/dev/null)" = 'Y' ]; then
+       echo -n '- '
+       if command -v apparmor_parser &>/dev/null; then
+               echo "$(wrap_good 'apparmor' 'enabled and tools installed')"
+       else
+               echo "$(wrap_bad 'apparmor' 'enabled, but apparmor_parser missing')"
+               echo -n '    '
+               # NOTE(review): these wrap_color calls pass no color argument,
+               # so the text is printed unstyled — confirm that is intended.
+               if command -v apt-get &>/dev/null; then
+                       echo "$(wrap_color '(use "apt-get install apparmor" to fix this)')"
+               elif command -v yum &>/dev/null; then
+                       echo "$(wrap_color '(your best bet is "yum install apparmor-parser")')"
+               else
+                       echo "$(wrap_color '(look for an "apparmor" package for your distribution)')"
+               fi
+       fi
+fi
+
+# Options checked unconditionally under "Generally Necessary".
+flags=(
+       NAMESPACES {NET,PID,IPC,UTS}_NS
+       CGROUPS CGROUP_CPUACCT CGROUP_DEVICE CGROUP_FREEZER CGROUP_SCHED CPUSETS MEMCG
+       KEYS
+       VETH BRIDGE BRIDGE_NETFILTER
+       NF_NAT_IPV4 IP_NF_FILTER IP_NF_TARGET_MASQUERADE
+       NETFILTER_XT_MATCH_{ADDRTYPE,CONNTRACK,IPVS}
+       IP_NF_NAT NF_NAT NF_NAT_NEEDED
+       
+       # required for bind-mounting /dev/mqueue into containers
+       POSIX_MQUEUE
+)
+check_flags "${flags[@]}"
+echo
+
+echo 'Optional Features:'
+{
+       check_flags USER_NS
+       check_distro_userns
+
+       check_flags SECCOMP
+       check_flags CGROUP_PIDS
+
+       check_flags MEMCG_SWAP MEMCG_SWAP_ENABLED
+       if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then
+               echo "    $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)"
+       fi
+}
+
+# Version-gated flags: only checked on kernels old enough to have them.
+if [ "$kernelMajor" -lt 4 ] || [ "$kernelMajor" -eq 4 -a "$kernelMinor" -le 5 ]; then
+       check_flags MEMCG_KMEM
+fi
+
+if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 18 ]; then
+       check_flags RESOURCE_COUNTERS
+fi
+
+# Kernels <= 3.13 use NETPRIO_CGROUP; later kernels use CGROUP_NET_PRIO.
+if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 13 ]; then
+       netprio=NETPRIO_CGROUP
+else
+       netprio=CGROUP_NET_PRIO
+fi
+
+# Remaining optional features.
+flags=(
+       BLK_CGROUP BLK_DEV_THROTTLING IOSCHED_CFQ CFQ_GROUP_IOSCHED
+       CGROUP_PERF
+       CGROUP_HUGETLB
+       NET_CLS_CGROUP $netprio
+       CFS_BANDWIDTH FAIR_GROUP_SCHED RT_GROUP_SCHED
+       IP_NF_TARGET_REDIRECT
+       IP_VS
+       IP_VS_NFCT
+       IP_VS_PROTO_TCP
+       IP_VS_PROTO_UDP
+       IP_VS_RR
+)
+check_flags "${flags[@]}"
diff --git a/script/release.sh b/script/release.sh
new file mode 100755 (executable)
index 0000000..a1ebc95
--- /dev/null
@@ -0,0 +1,130 @@
+#!/bin/bash
+# Copyright (C) 2017 SUSE LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+## --->
+# Project-specific options and functions. In *theory* you shouldn't need to
+# touch anything else in this script in order to use this elsewhere.
+project="runc"
+root="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")"
+
+# This function takes an output path as an argument, where the built
+# (preferably static) binary should be placed.
+function build_project() {
+       # NOTE(review): builddir is computed but never used below — confirm.
+       builddir="$(dirname "$1")"
+
+       # Build with all tags enabled.
+       make -C "$root" COMMIT_NO= BUILDTAGS="seccomp selinux apparmor" static
+       mv "$root/$project" "$1"
+}
+
+# End of the easy-to-configure portion.
+## <---
+
+# Print usage information.
+function usage() {
+       echo "usage: release.sh [-S <gpg-key-id>] [-c <commit-ish>] [-r <release-dir>] [-v <version>]" >&2
+       exit 1
+}
+
+# Log something to stderr.
+function log() {
+       echo "[*] $*" >&2
+}
+
+# Log something to stderr and then exit with 0.
+function bail() {
+       log "$@"
+       exit 0
+}
+
+# Conduct a sanity-check to make sure that GPG provided with the given
+# arguments can sign something. Inability to sign things is not a fatal error.
+function gpg_cansign() {
+       gpg "$@" --clear-sign </dev/null >/dev/null
+}
+
+# When creating releases we need to build static binaries, an archive of the
+# current commit, and generate detached signatures for both.
+keyid=""
+commit="HEAD"
+version=""
+releasedir=""
+hashcmd=""
+# NOTE: -h takes an argument here (the hash command), it is not "help".
+while getopts "S:c:r:v:h:" opt; do
+       case "$opt" in
+               S)
+                       keyid="$OPTARG"
+                       ;;
+               c)
+                       commit="$OPTARG"
+                       ;;
+               r)
+                       releasedir="$OPTARG"
+                       ;;
+               v)
+                       version="$OPTARG"
+                       ;;
+               h)
+                       hashcmd="$OPTARG"
+                       ;;
+               \:)
+                       echo "Missing argument: -$OPTARG" >&2
+                       usage
+                       ;;
+               \?)
+                       echo "Invalid option: -$OPTARG" >&2
+                       usage
+                       ;;
+       esac
+done
+
+# Defaults: version from the VERSION file, release/<version> output dir,
+# sha256sum for checksums, and the host's GOARCH (amd64 if `go` is absent).
+version="${version:-$(<"$root/VERSION")}"
+releasedir="${releasedir:-release/$version}"
+hashcmd="${hashcmd:-sha256sum}"
+goarch="$(go env GOARCH || echo "amd64")"
+
+log "creating $project release in '$releasedir'"
+log "  version: $version"
+log "   commit: $commit"
+log "      key: ${keyid:-DEFAULT}"
+log "     hash: $hashcmd"
+
+# Make explicit what we're doing.
+set -x
+
+# Make the release directory.
+rm -rf "$releasedir" && mkdir -p "$releasedir"
+
+# Build project.
+build_project "$releasedir/$project.$goarch"
+
+# Generate new archive.
+git archive --format=tar --prefix="$project-$version/" "$commit" | xz > "$releasedir/$project.tar.xz"
+
+# Generate sha256 checksums for both.
+( cd "$releasedir" ; "$hashcmd" "$project".{"$goarch",tar.xz} > "$project.$hashcmd" ; )
+
+# Set up the gpgflags. $gpgflags is deliberately expanded unquoted below.
+[[ "$keyid" ]] && export gpgflags="--default-key $keyid"
+gpg_cansign $gpgflags || bail "Could not find suitable GPG key, skipping signing step."
+
+# Sign everything.
+gpg $gpgflags --detach-sign --armor "$releasedir/$project.$goarch"
+gpg $gpgflags --detach-sign --armor "$releasedir/$project.tar.xz"
+# Brace expansion yields "--output FILE.tmp FILE": clear-sign the checksum
+# file into FILE.tmp, then atomically replace the original with it.
+gpg $gpgflags --clear-sign --armor \
+       --output "$releasedir/$project.$hashcmd"{.tmp,} && \
+       mv "$releasedir/$project.$hashcmd"{.tmp,}
diff --git a/script/tmpmount b/script/tmpmount
new file mode 100755 (executable)
index 0000000..5ac6bc2
--- /dev/null
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Mount a fresh tmpfs over /tmp, then exec the given command with it in place.
+
+mount -t tmpfs none /tmp
+exec "$@"
diff --git a/script/validate-c b/script/validate-c
new file mode 100755 (executable)
index 0000000..7c01b51
--- /dev/null
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Verify that all committed C sources touched by this change are formatted
+# according to GNU indent's Linux style.
+
+source "$(dirname "$BASH_SOURCE")/.validate"
+
+# Collect added/copied/modified/renamed .c files, excluding vendored code.
+IFS=$'\n'
+files=($(validate_diff --diff-filter=ACMR --name-only -- '*.c' | grep -v '^vendor/' || true))
+unset IFS
+
+# indent(1): "You must use the '-T' option to tell indent the name of all the typenames in your program that are defined by typedef."
+INDENT="indent -linux -l120 -T size_t -T jmp_buf"
+if [ -z "$(indent --version 2>&1 | grep GNU)" ]; then
+       echo "Skipping C indentation checks, as GNU indent is not installed."
+       exit 0
+fi
+
+badFiles=()
+for f in "${files[@]}"; do
+       orig=$(mktemp)
+       formatted=$(mktemp)
+       # we use "git show" here to validate that what's committed is formatted
+       git show "$VALIDATE_HEAD:$f" > ${orig}
+       ${INDENT} ${orig} -o ${formatted}
+       # Any diff between the committed file and its indented form is a failure.
+       if [ "$(diff -u ${orig} ${formatted})" ]; then
+               badFiles+=("$f")
+       fi
+       rm -f ${orig} ${formatted}
+done
+
+if [ ${#badFiles[@]} -eq 0 ]; then
+       echo 'Congratulations!  All C source files are properly formatted.'
+else
+       {
+               echo "These files are not properly formatted:"
+               for f in "${badFiles[@]}"; do
+                       echo " - $f"
+               done
+               echo
+               echo "Please reformat the above files using \"${INDENT}\" and commit the result."
+               echo
+       } >&2
+       false
+fi
diff --git a/script/validate-gofmt b/script/validate-gofmt
new file mode 100755 (executable)
index 0000000..8337ed2
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Verify that all committed Go sources touched by this change are gofmt-clean
+# (with simplification, -s).
+
+source "$(dirname "$BASH_SOURCE")/.validate"
+
+# Collect added/copied/modified/renamed .go files, excluding vendored code.
+IFS=$'\n'
+files=($(validate_diff --diff-filter=ACMR --name-only -- '*.go' | grep -v '^vendor/' || true))
+unset IFS
+
+badFiles=()
+for f in "${files[@]}"; do
+       # we use "git show" here to validate that what's committed is formatted
+       if [ "$(git show "$VALIDATE_HEAD:$f" | gofmt -s -l)" ]; then
+               badFiles+=("$f")
+       fi
+done
+
+if [ ${#badFiles[@]} -eq 0 ]; then
+       echo 'Congratulations!  All Go source files are properly formatted.'
+else
+       {
+               echo "These files are not properly gofmt'd:"
+               for f in "${badFiles[@]}"; do
+                       echo " - $f"
+               done
+               echo
+               echo 'Please reformat the above files using "gofmt -s -w" and commit the result.'
+               echo
+       } >&2
+       false
+fi
diff --git a/signalmap.go b/signalmap.go
new file mode 100644 (file)
index 0000000..f9a6347
--- /dev/null
@@ -0,0 +1,47 @@
+// +build linux
+// +build !mips,!mipsle,!mips64,!mips64le
+
+package main
+
+import (
+       "syscall"
+
+       "golang.org/x/sys/unix"
+)
+
+// signalMap maps signal names (without the "SIG" prefix, e.g. "TERM")
+// to their numeric syscall.Signal values.
+var signalMap = map[string]syscall.Signal{
+       "ABRT":   unix.SIGABRT,
+       "ALRM":   unix.SIGALRM,
+       "BUS":    unix.SIGBUS,
+       "CHLD":   unix.SIGCHLD,
+       "CLD":    unix.SIGCLD,
+       "CONT":   unix.SIGCONT,
+       "FPE":    unix.SIGFPE,
+       "HUP":    unix.SIGHUP,
+       "ILL":    unix.SIGILL,
+       "INT":    unix.SIGINT,
+       "IO":     unix.SIGIO,
+       "IOT":    unix.SIGIOT,
+       "KILL":   unix.SIGKILL,
+       "PIPE":   unix.SIGPIPE,
+       "POLL":   unix.SIGPOLL,
+       "PROF":   unix.SIGPROF,
+       "PWR":    unix.SIGPWR,
+       "QUIT":   unix.SIGQUIT,
+       "SEGV":   unix.SIGSEGV,
+       "STKFLT": unix.SIGSTKFLT,
+       "STOP":   unix.SIGSTOP,
+       "SYS":    unix.SIGSYS,
+       "TERM":   unix.SIGTERM,
+       "TRAP":   unix.SIGTRAP,
+       "TSTP":   unix.SIGTSTP,
+       "TTIN":   unix.SIGTTIN,
+       "TTOU":   unix.SIGTTOU,
+       "URG":    unix.SIGURG,
+       "USR1":   unix.SIGUSR1,
+       "USR2":   unix.SIGUSR2,
+       "VTALRM": unix.SIGVTALRM,
+       "WINCH":  unix.SIGWINCH,
+       "XCPU":   unix.SIGXCPU,
+       "XFSZ":   unix.SIGXFSZ,
+}
diff --git a/signalmap_mipsx.go b/signalmap_mipsx.go
new file mode 100644 (file)
index 0000000..046bf15
--- /dev/null
@@ -0,0 +1,45 @@
+// +build linux,mips linux,mipsle linux,mips64 linux,mips64le
+
+package main
+
+import (
+       "syscall"
+
+       "golang.org/x/sys/unix"
+)
+
+// signalMap maps signal names (without the "SIG" prefix) to their numeric
+// syscall.Signal values. Identical to the map in signalmap.go except that
+// SIGSTKFLT is omitted for the mips build-tag families.
+var signalMap = map[string]syscall.Signal{
+       "ABRT":   unix.SIGABRT,
+       "ALRM":   unix.SIGALRM,
+       "BUS":    unix.SIGBUS,
+       "CHLD":   unix.SIGCHLD,
+       "CLD":    unix.SIGCLD,
+       "CONT":   unix.SIGCONT,
+       "FPE":    unix.SIGFPE,
+       "HUP":    unix.SIGHUP,
+       "ILL":    unix.SIGILL,
+       "INT":    unix.SIGINT,
+       "IO":     unix.SIGIO,
+       "IOT":    unix.SIGIOT,
+       "KILL":   unix.SIGKILL,
+       "PIPE":   unix.SIGPIPE,
+       "POLL":   unix.SIGPOLL,
+       "PROF":   unix.SIGPROF,
+       "PWR":    unix.SIGPWR,
+       "QUIT":   unix.SIGQUIT,
+       "SEGV":   unix.SIGSEGV,
+       "STOP":   unix.SIGSTOP,
+       "SYS":    unix.SIGSYS,
+       "TERM":   unix.SIGTERM,
+       "TRAP":   unix.SIGTRAP,
+       "TSTP":   unix.SIGTSTP,
+       "TTIN":   unix.SIGTTIN,
+       "TTOU":   unix.SIGTTOU,
+       "URG":    unix.SIGURG,
+       "USR1":   unix.SIGUSR1,
+       "USR2":   unix.SIGUSR2,
+       "VTALRM": unix.SIGVTALRM,
+       "WINCH":  unix.SIGWINCH,
+       "XCPU":   unix.SIGXCPU,
+       "XFSZ":   unix.SIGXFSZ,
+}
diff --git a/signals.go b/signals.go
new file mode 100644 (file)
index 0000000..b67f65a
--- /dev/null
@@ -0,0 +1,139 @@
+// +build linux
+
+package main
+
+import (
+       "os"
+       "os/signal"
+       "syscall" // only for Signal
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/system"
+       "github.com/opencontainers/runc/libcontainer/utils"
+
+       "github.com/sirupsen/logrus"
+       "golang.org/x/sys/unix"
+)
+
+const signalBufferSize = 2048
+
+// newSignalHandler returns a signal handler for processing SIGCHLD and SIGWINCH signals
+// while still forwarding all other signals to the process.
+// If notifySocket is present, use it to read systemd notifications from the container and
+// forward them to notifySocketHost.
+func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler {
+       if enableSubreaper {
+               // set us as the subreaper before registering the signal handler for the container
+               if err := system.SetSubreaper(1); err != nil {
+                       // best-effort: failure to become subreaper is only logged
+                       logrus.Warn(err)
+               }
+       }
+       // ensure that we have a large buffer size so that we do not miss any signals
+       // in case we are not processing them fast enough.
+       s := make(chan os.Signal, signalBufferSize)
+       // handle all signals for the process.
+       signal.Notify(s)
+       return &signalHandler{
+               signals:      s,
+               notifySocket: notifySocket,
+       }
+}
+
+// exit models a process exit status with the pid and
+// exit status.
+type exit struct {
+       pid    int
+       status int
+}
+
+// signalHandler receives all process signals on signals and optionally
+// relays systemd notifications through notifySocket.
+type signalHandler struct {
+       signals      chan os.Signal
+       notifySocket *notifySocket
+}
+
+// forward handles the main signal event loop forwarding, resizing, or reaping depending
+// on the signal received.
+func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach bool) (int, error) {
+       // make sure we know the pid of our main process so that we can return
+       // after it dies.
+       if detach && h.notifySocket == nil {
+               // nothing to forward for a detached container without a notify socket
+               return 0, nil
+       }
+
+       pid1, err := process.Pid()
+       if err != nil {
+               return -1, err
+       }
+
+       if h.notifySocket != nil {
+               if detach {
+                       // detached: run the notify proxy in the foreground and return
+                       h.notifySocket.run(pid1)
+                       return 0, nil
+               }
+               go h.notifySocket.run(0)
+       }
+
+       // Perform the initial tty resize. Always ignore errors resizing because
+       // stdout might have disappeared (due to races with when SIGHUP is sent).
+       _ = tty.resize()
+       // Handle and forward signals.
+       for s := range h.signals {
+               switch s {
+               case unix.SIGWINCH:
+                       // Ignore errors resizing, as above.
+                       _ = tty.resize()
+               case unix.SIGCHLD:
+                       // Reap all exited children; return once pid1 itself exits.
+                       exits, err := h.reap()
+                       if err != nil {
+                               logrus.Error(err)
+                       }
+                       for _, e := range exits {
+                               logrus.WithFields(logrus.Fields{
+                                       "pid":    e.pid,
+                                       "status": e.status,
+                               }).Debug("process exited")
+                               if e.pid == pid1 {
+                                       // call Wait() on the process even though we already have the exit
+                                       // status because we must ensure that any of the go specific process
+                                       // fun such as flushing pipes are complete before we return.
+                                       process.Wait()
+                                       if h.notifySocket != nil {
+                                               h.notifySocket.Close()
+                                       }
+                                       return e.status, nil
+                               }
+                       }
+               default:
+                       // Forward every other signal to the container's init process.
+                       // signal.Notify delivers syscall.Signal values on unix, so
+                       // the type assertion below is expected to hold.
+                       logrus.Debugf("sending signal to process %s", s)
+                       if err := unix.Kill(pid1, s.(syscall.Signal)); err != nil {
+                               logrus.Error(err)
+                       }
+               }
+       }
+       // signals channel closed without seeing pid1 exit
+       return -1, nil
+}
+
+// reap runs wait4 in a loop until we have finished processing any existing exits
+// then returns all exits to the main event loop for further processing.
+func (h *signalHandler) reap() (exits []exit, err error) {
+       var (
+               ws  unix.WaitStatus
+               rus unix.Rusage
+       )
+       for {
+               pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus)
+               if err != nil {
+                       // wait4 can be interrupted by an unrelated signal landing
+                       // mid-call; retry instead of aborting the reap loop.
+                       if err == unix.EINTR {
+                               continue
+                       }
+                       // ECHILD: no children left to reap — not an error.
+                       if err == unix.ECHILD {
+                               return exits, nil
+                       }
+                       return nil, err
+               }
+               // With WNOHANG, pid 0 means no child has changed state yet.
+               if pid <= 0 {
+                       return exits, nil
+               }
+               exits = append(exits, exit{
+                       pid:    pid,
+                       status: utils.ExitStatus(ws),
+               })
+       }
+}
diff --git a/spec.go b/spec.go
new file mode 100644 (file)
index 0000000..322a83d
--- /dev/null
+++ b/spec.go
@@ -0,0 +1,145 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "io/ioutil"
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/specconv"
+       "github.com/opencontainers/runtime-spec/specs-go"
+       "github.com/urfave/cli"
+)
+
+// specCommand implements "runc spec": write an example config.json into the
+// current (or --bundle) directory, optionally adjusted for rootless use.
+var specCommand = cli.Command{
+       Name:      "spec",
+       Usage:     "create a new specification file",
+       ArgsUsage: "",
+       Description: `The spec command creates the new specification file named "` + specConfig + `" for
+the bundle.
+
+The spec generated is just a starter file. Editing of the spec is required to
+achieve desired results. For example, the newly generated spec includes an args
+parameter that is initially set to call the "sh" command when the container is
+started. Calling "sh" may work for an ubuntu container or busybox, but will not
+work for containers that do not include the "sh" program.
+
+EXAMPLE:
+  To run docker's hello-world container one needs to set the args parameter
+in the spec to call hello. This can be done using the sed command or a text
+editor. The following commands create a bundle for hello-world, change the
+default args parameter in the spec from "sh" to "/hello", then run the hello
+command in a new hello-world container named container1:
+
+    mkdir hello
+    cd hello
+    docker pull hello-world
+    docker export $(docker create hello-world) > hello-world.tar
+    mkdir rootfs
+    tar -C rootfs -xf hello-world.tar
+    runc spec
+    sed -i 's;"sh";"/hello";' ` + specConfig + `
+    runc run container1
+
+In the run command above, "container1" is the name for the instance of the
+container that you are starting. The name you provide for the container instance
+must be unique on your host.
+
+An alternative for generating a customized spec config is to use "oci-runtime-tool", the
+sub-command "oci-runtime-tool generate" has lots of options that can be used to do any
+customizations as you want, see runtime-tools (https://github.com/opencontainers/runtime-tools)
+to get more information.
+
+When starting a container through runc, runc needs root privilege. If not
+already running as root, you can use sudo to give runc root privilege. For
+example: "sudo runc start container1" will give runc root privilege to start the
+container on your host.
+
+Alternatively, you can start a rootless container, which has the ability to run
+without root privileges. For this to work, the specification file needs to be
+adjusted accordingly. You can pass the parameter --rootless to this command to
+generate a proper rootless spec file.
+
+Note that --rootless is not needed when you execute runc as the root in a user namespace
+created by an unprivileged user.
+`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "bundle, b",
+                       Value: "",
+                       Usage: "path to the root of the bundle directory",
+               },
+               cli.BoolFlag{
+                       Name:  "rootless",
+                       Usage: "generate a configuration for a rootless container",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               // "spec" takes no positional arguments.
+               if err := checkArgs(context, 0, exactArgs); err != nil {
+                       return err
+               }
+               spec := specconv.Example()
+
+               rootless := context.Bool("rootless")
+               if rootless {
+                       specconv.ToRootless(spec)
+               }
+
+               // checkNoFile refuses to clobber an existing file at name.
+               checkNoFile := func(name string) error {
+                       _, err := os.Stat(name)
+                       if err == nil {
+                               return fmt.Errorf("File %s exists. Remove it first", name)
+                       }
+                       if !os.IsNotExist(err) {
+                               return err
+                       }
+                       return nil
+               }
+               // Write into the bundle directory when one was given.
+               bundle := context.String("bundle")
+               if bundle != "" {
+                       if err := os.Chdir(bundle); err != nil {
+                               return err
+                       }
+               }
+               if err := checkNoFile(specConfig); err != nil {
+                       return err
+               }
+               data, err := json.MarshalIndent(spec, "", "\t")
+               if err != nil {
+                       return err
+               }
+               // 0666 before umask; the file holds no secrets.
+               return ioutil.WriteFile(specConfig, data, 0666)
+       },
+}
+
+// loadSpec loads the specification from the provided path.
+// It returns a friendlier error when the file is missing, and validates the
+// spec's process section before returning.
+func loadSpec(cPath string) (spec *specs.Spec, err error) {
+       cf, err := os.Open(cPath)
+       if err != nil {
+               if os.IsNotExist(err) {
+                       return nil, fmt.Errorf("JSON specification file %s not found", cPath)
+               }
+               return nil, err
+       }
+       defer cf.Close()
+
+       // Decode into the named return value (spec is a *specs.Spec, so &spec
+       // lets the decoder allocate it).
+       if err = json.NewDecoder(cf).Decode(&spec); err != nil {
+               return nil, err
+       }
+       return spec, validateProcessSpec(spec.Process)
+}
+
+// createLibContainerRlimit converts an OCI POSIXRlimit into the libcontainer
+// configs.Rlimit form, translating the textual rlimit type via strToRlimit.
+func createLibContainerRlimit(rlimit specs.POSIXRlimit) (configs.Rlimit, error) {
+       rl, err := strToRlimit(rlimit.Type)
+       if err != nil {
+               return configs.Rlimit{}, err
+       }
+       return configs.Rlimit{
+               Type: rl,
+               Hard: rlimit.Hard,
+               Soft: rlimit.Soft,
+       }, nil
+}
diff --git a/start.go b/start.go
new file mode 100644 (file)
index 0000000..2bb698b
--- /dev/null
+++ b/start.go
@@ -0,0 +1,43 @@
+package main
+
+import (
+       "errors"
+       "fmt"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/urfave/cli"
+)
+
+// startCommand implements "runc start": run the user-defined process in a
+// container that is in the "created" state.
+var startCommand = cli.Command{
+       Name:  "start",
+       Usage: "executes the user defined process in a created container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+       Description: `The start command executes the user defined process in a created container.`,
+       Action: func(context *cli.Context) error {
+               // "start" takes exactly one argument: the container id.
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               status, err := container.Status()
+               if err != nil {
+                       return err
+               }
+               // Only a freshly-created container may be started.
+               switch status {
+               case libcontainer.Created:
+                       return container.Exec()
+               case libcontainer.Stopped:
+                       return errors.New("cannot start a container that has stopped")
+               case libcontainer.Running:
+                       return errors.New("cannot start an already running container")
+               default:
+                       // Fix: dropped the trailing "\n" — error strings must not end
+                       // with a newline (go vet / Go code review conventions); it also
+                       // caused a blank line when the error was printed by the CLI.
+                       return fmt.Errorf("cannot start a container in the %s state", status)
+               }
+       },
+}
diff --git a/state.go b/state.go
new file mode 100644 (file)
index 0000000..718813c
--- /dev/null
+++ b/state.go
@@ -0,0 +1,60 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "os"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/urfave/cli"
+)
+
+// stateCommand implements "runc state": print the container's state as
+// indented JSON on stdout.
+var stateCommand = cli.Command{
+       Name:  "state",
+       Usage: "output the state of a container",
+       ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container.`,
+       Description: `The state command outputs current state information for the
+instance of a container.`,
+       Action: func(context *cli.Context) error {
+               // "state" takes exactly one argument: the container id.
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+               containerStatus, err := container.Status()
+               if err != nil {
+                       return err
+               }
+               state, err := container.State()
+               if err != nil {
+                       return err
+               }
+               // Report pid 0 for a stopped container: the recorded init pid
+               // is stale once the process has exited.
+               pid := state.BaseState.InitProcessPid
+               if containerStatus == libcontainer.Stopped {
+                       pid = 0
+               }
+               // The bundle path and user annotations are stored in the
+               // config labels; split them back out for the report.
+               bundle, annotations := utils.Annotations(state.Config.Labels)
+               cs := containerState{
+                       Version:        state.BaseState.Config.Version,
+                       ID:             state.BaseState.ID,
+                       InitProcessPid: pid,
+                       Status:         containerStatus.String(),
+                       Bundle:         bundle,
+                       Rootfs:         state.BaseState.Config.Rootfs,
+                       Created:        state.BaseState.Created,
+                       Annotations:    annotations,
+               }
+               data, err := json.MarshalIndent(cs, "", "  ")
+               if err != nil {
+                       return err
+               }
+               os.Stdout.Write(data)
+               return nil
+       },
+}
diff --git a/tests/integration/README.md b/tests/integration/README.md
new file mode 100644 (file)
index 0000000..8ee6ebf
--- /dev/null
@@ -0,0 +1,83 @@
+# runc Integration Tests
+
+Integration tests provide end-to-end testing of runc.
+
+Note that integration tests do **not** replace unit tests.
+
+As a rule of thumb, code should be tested thoroughly with unit tests.
+Integration tests on the other hand are meant to test a specific feature end
+to end.
+
+Integration tests are written in *bash* using the
+[bats](https://github.com/sstephenson/bats) framework.
+
+## Running integration tests
+
+The easiest way to run integration tests is with Docker:
+```
+$ make integration
+```
+Alternatively, you can run integration tests directly on your host through make:
+```
+$ sudo make localintegration
+```
+Or you can just run them directly using bats
+```
+$ sudo bats tests/integration
+```
+To run a single test bucket:
+```
+$ make integration TESTPATH="/checkpoint.bats"
+```
+
+
+To run them on your host, you will need to set up a development environment plus
+[bats](https://github.com/sstephenson/bats#installing-bats-from-source)
+For example:
+```
+$ cd ~/go/src/github.com
+$ git clone https://github.com/sstephenson/bats.git
+$ cd bats
+$ ./install.sh /usr/local
+```
+
+> **Note**: There are known issues running the integration tests using
+> **devicemapper** as a storage driver, make sure that your docker daemon
+> is using **aufs** if you want to successfully run the integration tests.
+
+## Writing integration tests
+
+[helper functions](https://github.com/opencontainers/runc/blob/master/tests/integration/helpers.bash)
+are provided in order to facilitate
+writing tests.
+
+```sh
+#!/usr/bin/env bats
+
+# This will load the helpers.
+load helpers
+
+# setup is called at the beginning of every test.
+function setup() {
+  # see functions teardown_hello and setup_hello in helpers.bash, used to
+  # create a pristine environment for running your tests
+  teardown_hello
+  setup_hello
+}
+
+# teardown is called at the end of every test.
+function teardown() {
+  teardown_hello
+}
+
+@test "this is a simple test" {
+  runc run containerid
+  # "The runc macro" automatically populates $status, $output and $lines.
+  # Please refer to bats documentation to find out more.
+  [ "$status" -eq 0 ]
+
+  # check expected output
+  [[ "${output}" == *"Hello"* ]]
+}
+
+```
diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats
new file mode 100644 (file)
index 0000000..17812ab
--- /dev/null
@@ -0,0 +1,127 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function teardown() {
+    rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json
+    teardown_running_container test_cgroups_kmem
+    teardown_running_container test_cgroups_permissions
+    teardown_busybox
+}
+
+function setup() {
+    teardown
+    setup_busybox
+}
+
+function check_cgroup_value() {
+    cgroup=$1
+    source=$2
+    expected=$3
+
+    current=$(cat $cgroup/$source)
+    echo  $cgroup/$source
+    echo "current" $current "!?" "$expected"
+    [ "$current" -eq "$expected" ]
+}
+
+@test "runc update --kernel-memory (initialized)" {
+    [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+    requires cgroups_kmem
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+
+    # Set some initial known values
+    DATA=$(cat <<-EOF
+    "memory": {
+        "kernel": 16777216
+    },
+EOF
+    )
+    DATA=$(echo ${DATA} | sed 's/\n/\\n/g')
+    sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json
+
+    # run a detached busybox to work with
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem
+    [ "$status" -eq 0 ]
+
+    # update kernel memory limit
+    runc update test_cgroups_kmem --kernel-memory 50331648
+    [ "$status" -eq 0 ]
+
+       # check the value
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+}
+
+@test "runc update --kernel-memory (uninitialized)" {
+    [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+    requires cgroups_kmem
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+
+    # run a detached busybox to work with
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem
+    [ "$status" -eq 0 ]
+
+    # update kernel memory limit
+    runc update test_cgroups_kmem --kernel-memory 50331648
+    # Since kernel 4.6, we can update kernel memory without initialization
+    # because it's accounted by default.
+    if [ "$KERNEL_MAJOR" -lt 4 ] || [ "$KERNEL_MAJOR" -eq 4 -a "$KERNEL_MINOR" -le 5 ]; then
+        [ ! "$status" -eq 0 ]
+    else
+        [ "$status" -eq 0 ]
+        check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+    fi
+}
+
+@test "runc create (no limits + no cgrouppath + no permission) succeeds" {
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+    [ "$status" -eq 0 ]
+}
+
+@test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" {
+    requires rootless
+    requires rootless_no_cgroup
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+    [ "$status" -eq 1 ]
+    [[ ${lines[1]} == *"permission denied"* ]]
+}
+
+@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" {
+    requires rootless
+    requires rootless_no_cgroup
+
+    set_resources_limit "$BUSYBOX_BUNDLE"
+
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+    [ "$status" -eq 1 ]
+    [[ ${lines[1]} == *"cannot set pids limit: container could not join or create cgroup"* ]]
+}
+
+@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" {
+   [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+    set_resources_limit "$BUSYBOX_BUNDLE"
+
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+    [ "$status" -eq 0 ]
+}
+
+@test "runc exec (limits + cgrouppath + permission on the cgroup dir) succeeds" {
+   [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+    set_resources_limit "$BUSYBOX_BUNDLE"
+
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+    [ "$status" -eq 0 ]
+
+    runc exec test_cgroups_permissions echo "cgroups_exec"
+    [ "$status" -eq 0 ]
+    [[ ${lines[0]} == *"cgroups_exec"* ]]
+}
diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats
new file mode 100644 (file)
index 0000000..87696df
--- /dev/null
@@ -0,0 +1,348 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then
+    skip "CRIU test suite is skipped on systemd cgroup driver for now."
+  fi
+
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "checkpoint and restore" {
+  # XXX: currently criu require root containers.
+  requires criu root
+
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  for i in `seq 2`; do
+    # checkpoint the running container
+    runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox
+    ret=$?
+    # if you are having problems getting criu to work uncomment the following dump:
+    #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log
+    cat ./work-dir/dump.log | grep -B 5 Error || true
+    [ "$ret" -eq 0 ]
+
+    # after checkpoint busybox is no longer running
+    runc state test_busybox
+    [ "$status" -ne 0 ]
+
+    # restore from checkpoint
+    runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox
+    ret=$?
+    cat ./work-dir/restore.log | grep -B 5 Error || true
+    [ "$ret" -eq 0 ]
+
+    # busybox should be back up and running
+    testcontainer test_busybox running
+  done
+}
+
+@test "checkpoint --pre-dump and restore" {
+  # XXX: currently criu require root containers.
+  requires criu root
+
+  # The changes to 'terminal' are needed for running in detached mode
+  sed -i 's;"terminal": true;"terminal": false;' config.json
+  sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json
+
+  # The following code creates pipes for stdin and stdout.
+  # CRIU can't handle FIFOs, so we need all these tricks.
+  fifo=`mktemp -u /tmp/runc-fifo-XXXXXX`
+  mkfifo $fifo
+
+  # stdout
+  cat $fifo | cat $fifo &
+  pid=$!
+  exec 50</proc/$pid/fd/0
+  exec 51>/proc/$pid/fd/0
+
+  # stdin
+  cat $fifo | cat $fifo &
+  pid=$!
+  exec 60</proc/$pid/fd/0
+  exec 61>/proc/$pid/fd/0
+
+  echo -n > $fifo
+  unlink $fifo
+
+  # run busybox
+  __runc run -d test_busybox <&60 >&51 2>&51
+  [ $? -eq 0 ]
+
+  testcontainer test_busybox running
+
+  #test checkpoint pre-dump
+  mkdir parent-dir
+  runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox
+  [ "$status" -eq 0 ]
+
+  # busybox should still be running
+  runc state test_busybox
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"running"* ]]
+
+  # checkpoint the running container
+  mkdir image-dir
+  mkdir work-dir
+  runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox
+  cat ./work-dir/dump.log | grep -B 5 Error || true
+  [ "$status" -eq 0 ]
+
+  # after checkpoint busybox is no longer running
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+
+  # restore from checkpoint
+  __runc --criu "$CRIU" restore -d --work-path ./work-dir --image-path ./image-dir test_busybox <&60 >&51 2>&51
+  ret=$?
+  cat ./work-dir/restore.log | grep -B 5 Error || true
+  [ $ret -eq 0 ]
+
+  # busybox should be back up and running
+  testcontainer test_busybox running
+
+  runc exec --cwd /bin test_busybox echo ok
+  [ "$status" -eq 0 ]
+  [[ ${output} == "ok" ]]
+
+  echo Ping >&61
+  exec 61>&-
+  exec 51>&-
+  run cat <&50
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"ponG Ping"* ]]
+}
+
+@test "checkpoint --lazy-pages and restore" {
+  # XXX: currently criu require root containers.
+  requires criu root
+
+  # check if lazy-pages is supported
+  run ${CRIU} check --feature uffd-noncoop
+  if [ "$status" -eq 1 ]; then
+    # this criu does not support lazy migration; skip the test
+    skip "this criu does not support lazy migration"
+  fi
+
+  # The changes to 'terminal' are needed for running in detached mode
+  sed -i 's;"terminal": true;"terminal": false;' config.json
+  # This should not be necessary: https://github.com/checkpoint-restore/criu/issues/575
+  sed -i 's;"readonly": true;"readonly": false;' config.json
+  sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json
+
+  # The following code creates pipes for stdin and stdout.
+  # CRIU can't handle FIFOs, so we need all these tricks.
+  fifo=`mktemp -u /tmp/runc-fifo-XXXXXX`
+  mkfifo $fifo
+
+  # For lazy migration we need to know when CRIU is ready to serve
+  # the memory pages via TCP.
+  lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX`
+  mkfifo $lazy_pipe
+
+  # TCP port for lazy migration
+  port=27277
+
+  # stdout
+  cat $fifo | cat $fifo &
+  pid=$!
+  exec 50</proc/$pid/fd/0
+  exec 51>/proc/$pid/fd/0
+
+  # stdin
+  cat $fifo | cat $fifo &
+  pid=$!
+  exec 60</proc/$pid/fd/0
+  exec 61>/proc/$pid/fd/0
+
+  echo -n > $fifo
+  unlink $fifo
+
+  # run busybox
+  __runc run -d test_busybox <&60 >&51 2>&51
+  [ $? -eq 0 ]
+
+  testcontainer test_busybox running
+
+  # checkpoint the running container
+  mkdir image-dir
+  mkdir work-dir
+  # Double fork taken from helpers.bats
+  # We need to start 'runc checkpoint --lazy-pages' in the background,
+  # so we double fork in the shell.
+  (runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) &
+  # Sleeping here. This is ugly, but not sure how else to handle it.
+  # The return code of the runc process running in the background is
+  # needed in case there is some basic error. Whether the lazy migration
+  # is ready is signalled via $lazy_pipe, which will most likely be
+  # ready after sleeping for two seconds.
+  sleep 2
+  # Check if inventory.img was written
+  [ -e image-dir/inventory.img ]
+  # If the inventory.img exists criu checkpointed some things, let's see
+  # if there were other errors in the log file.
+  run grep -B 5 Error ./work-dir/dump.log -q
+  [ "$status" -eq 1 ]
+
+  # This will block until CRIU is ready to serve memory pages
+  cat $lazy_pipe
+  [ "$status" -eq 1 ]
+
+  unlink $lazy_pipe
+
+  # Double fork taken from helpers.bats
+  # We need to start 'criu lazy-pages' in the background,
+  # so we double fork in the shell.
+  # Start CRIU in lazy-daemon mode
+  $(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) &
+
+  # Restore lazily from checkpoint.
+  # The restored container needs a different name as the checkpointed
+  # container is not yet destroyed. It is only destroyed at that point
+  # in time when the last page is lazily transferred to the destination.
+  # Killing the CRIU on the checkpoint side will let the container
+  # continue to run if the migration failed at some point.
+  __runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51
+  ret=$?
+  [ $ret -eq 0 ]
+  run grep -B 5 Error ./work-dir/dump.log -q
+  [ "$status" -eq 1 ]
+
+  # busybox should be back up and running
+  testcontainer test_busybox_restore running
+
+  runc exec --cwd /bin test_busybox_restore echo ok
+  [ "$status" -eq 0 ]
+  [[ ${output} == "ok" ]]
+
+  echo Ping >&61
+  exec 61>&-
+  exec 51>&-
+  run cat <&50
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"ponG Ping"* ]]
+}
+
+@test "checkpoint and restore in external network namespace" {
+  # XXX: currently criu require root containers.
+  requires criu root
+
+  # check if external_net_ns is supported; only with criu 3.10++
+  run ${CRIU} check --feature external_net_ns
+  if [ "$status" -eq 1 ]; then
+    # this criu does not support external_net_ns; skip the test
+    skip "this criu does not support external network namespaces"
+  fi
+
+  # create a temporary name for the test network namespace
+  tmp=`mktemp`
+  rm -f $tmp
+  ns_name=`basename $tmp`
+  # create network namespace
+  ip netns add $ns_name
+  ns_path=`ip netns add $ns_name 2>&1 | sed -e 's/.*"\(.*\)".*/\1/'`
+
+  ns_inode=`ls -iL $ns_path | awk '{ print $1 }'`
+
+  # tell runc which network namespace to use
+  sed -i "s;\"type\": \"network\";\"type\": \"network\",\"path\": \"$ns_path\";" config.json
+
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  for i in `seq 2`; do
+    # checkpoint the running container; this automatically tells CRIU to
+    # handle the network namespace defined in config.json as an external
+    runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox
+    ret=$?
+    # if you are having problems getting criu to work uncomment the following dump:
+    #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log
+    cat ./work-dir/dump.log | grep -B 5 Error || true
+    [ "$ret" -eq 0 ]
+
+    # after checkpoint busybox is no longer running
+    runc state test_busybox
+    [ "$status" -ne 0 ]
+
+    # restore from checkpoint; this should restore the container into the existing network namespace
+    runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox
+    ret=$?
+    cat ./work-dir/restore.log | grep -B 5 Error || true
+    [ "$ret" -eq 0 ]
+
+    # busybox should be back up and running
+    testcontainer test_busybox running
+
+    # container should be running in same network namespace as before
+    pid=`__runc state test_busybox | jq '.pid'`
+    ns_inode_new=`readlink /proc/$pid/ns/net | sed -e 's/.*\[\(.*\)\]/\1/'`
+    echo "old network namespace inode $ns_inode"
+    echo "new network namespace inode $ns_inode_new"
+    [ "$ns_inode" -eq "$ns_inode_new" ]
+  done
+  ip netns del $ns_name
+}
+
+@test "checkpoint and restore with container specific CRIU config" {
+  # XXX: currently criu require root containers.
+  requires criu root
+
+  tmp=`mktemp /tmp/runc-criu-XXXXXX.conf`
+  # This is the file we write to /etc/criu/default.conf
+  tmplog1=`mktemp /tmp/runc-criu-log-XXXXXX.log`
+  unlink $tmplog1
+  tmplog1=`basename $tmplog1`
+  # That is the actual configuration file to be used
+  tmplog2=`mktemp /tmp/runc-criu-log-XXXXXX.log`
+  unlink $tmplog2
+  tmplog2=`basename $tmplog2`
+  # This adds the annotation 'org.criu.config' to set a container
+  # specific CRIU config file.
+  sed -i "s;\"process\";\"annotations\":{\"org.criu.config\": \"$tmp\"},\"process\";" config.json
+  # Tell CRIU to use another configuration file
+  mkdir -p /etc/criu
+  echo "log-file=$tmplog1" > /etc/criu/default.conf
+  # Make sure the RPC defined configuration file overwrites the previous
+  echo "log-file=$tmplog2" > $tmp
+
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  # checkpoint the running container
+  runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox
+  [ "$status" -eq 0 ]
+  ! test -f ./work-dir/$tmplog1
+  test -f ./work-dir/$tmplog2
+
+  # after checkpoint busybox is no longer running
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+
+  test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2
+  # restore from checkpoint
+  runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+  ! test -f ./work-dir/$tmplog1
+  test -f ./work-dir/$tmplog2
+
+  # busybox should be back up and running
+  testcontainer test_busybox running
+  unlink $tmp
+  test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2
+}
+
diff --git a/tests/integration/config.json b/tests/integration/config.json
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/tests/integration/create.bats b/tests/integration/create.bats
new file mode 100644 (file)
index 0000000..abd4da2
--- /dev/null
@@ -0,0 +1,89 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc create" {
+  runc create --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  # start the command
+  runc start test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+}
+
+@test "runc create exec" {
+  runc create --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  runc exec test_busybox true
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  # start the command
+  runc start test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+}
+
+@test "runc create --pid-file" {
+  runc create --pid-file pid.txt --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+
+  # start the command
+  runc start test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+}
+
+@test "runc create --pid-file with new CWD" {
+  # create pid_file directory as the CWD
+  run mkdir pid_file
+  [ "$status" -eq 0 ]
+  run cd pid_file
+  [ "$status" -eq 0 ]
+
+  runc create --pid-file pid.txt -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET  test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+
+  # start the command
+  runc start test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+}
diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats
new file mode 100644 (file)
index 0000000..e02cf4a
--- /dev/null
@@ -0,0 +1,81 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_hello
+  setup_hello
+}
+
+function teardown() {
+  teardown_hello
+}
+
+@test "global --debug" {
+  # run hello-world
+  runc --debug run test_hello
+  echo "${output}"
+  [ "$status" -eq 0 ]
+
+  # check expected debug output was sent to stderr
+  [[ "${output}" == *"level=debug"* ]]
+  [[ "${output}" == *"nsexec started"* ]]
+  [[ "${output}" == *"child process in init()"* ]]
+}
+
+@test "global --debug to --log" {
+  # run hello-world
+  runc --log log.out --debug run test_hello
+  [ "$status" -eq 0 ]
+
+  # check output does not include debug info
+  [[ "${output}" != *"level=debug"* ]]
+
+  # check log.out was generated
+  [ -e log.out ]
+
+  # check expected debug output was sent to log.out
+  run cat log.out
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"level=debug"* ]]
+  [[ "${output}" == *"nsexec started"* ]]
+  [[ "${output}" == *"child process in init()"* ]]
+}
+
+@test "global --debug to --log --log-format 'text'" {
+  # run hello-world
+  runc --log log.out --log-format "text" --debug run test_hello
+  [ "$status" -eq 0 ]
+
+  # check output does not include debug info
+  [[ "${output}" != *"level=debug"* ]]
+
+  # check log.out was generated
+  [ -e log.out ]
+
+  # check expected debug output was sent to log.out
+  run cat log.out
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"level=debug"* ]]
+  [[ "${output}" == *"nsexec started"* ]]
+  [[ "${output}" == *"child process in init()"* ]]
+}
+
+@test "global --debug to --log --log-format 'json'" {
+  # run hello-world
+  runc --log log.out --log-format "json" --debug run test_hello
+  [ "$status" -eq 0 ]
+
+  # check output does not include debug info
+  [[ "${output}" != *"level=debug"* ]]
+
+  # check log.out was generated
+  [ -e log.out ]
+
+  # check expected debug output was sent to log.out
+  run cat log.out
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *'"level":"debug"'* ]]
+  [[ "${output}" == *"nsexec started"* ]]
+  [[ "${output}" == *"child process in init()"* ]]
+}
diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats
new file mode 100644 (file)
index 0000000..c5ed215
--- /dev/null
@@ -0,0 +1,53 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc delete" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc kill test_busybox KILL
+  [ "$status" -eq 0 ]
+  # wait for busybox to be in the destroyed state
+  retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+  # delete test_busybox
+  runc delete test_busybox
+  [ "$status" -eq 0 ]
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+}
+
+@test "runc delete --force" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  # force delete test_busybox
+  runc delete --force test_busybox
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+}
+
+@test "runc delete --force ignore not exist" {
+  runc delete --force notexists
+  [ "$status" -eq 0 ]
+}
diff --git a/tests/integration/events.bats b/tests/integration/events.bats
new file mode 100644 (file)
index 0000000..b3e6315
--- /dev/null
@@ -0,0 +1,109 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "events --stats" {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # generate stats
+  runc events --stats test_busybox
+  [ "$status" -eq 0 ]
+  [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]]
+  [[ "${lines[0]}" == *"data"* ]]
+}
+
+@test "events --interval default " {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # spawn two sub processes (shells)
+  # the first sub process is an event logger that sends stats events to events.log
+  # the second sub process waits for an event that includes test_busybox then
+  # kills the test_busybox container which causes the event logger to exit
+  (__runc events test_busybox > events.log) &
+  (
+    retry 10 1 eval "grep -q 'test_busybox' events.log"
+    teardown_running_container test_busybox
+  ) &
+  wait # wait for the above sub shells to finish
+
+  [ -e events.log ]
+
+  run cat events.log
+  [ "$status" -eq 0 ]
+  [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]]
+  [[ "${lines[0]}" == *"data"* ]]
+}
+
+@test "events --interval 1s " {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # spawn two sub processes (shells)
+  # the first sub process is an event logger that sends stats events to events.log once a second
+  # the second sub process tries 3 times for an event that includes test_busybox
+  # pausing 1s between each attempt then kills the test_busybox container which
+  # causes the event logger to exit
+  (__runc events --interval 1s test_busybox > events.log) &
+  (
+    retry 3 1 eval "grep -q 'test_busybox' events.log"
+    teardown_running_container test_busybox
+  ) &
+  wait # wait for the above sub shells to finish
+
+  [ -e events.log ]
+
+  run eval "grep -q 'test_busybox' events.log"
+  [ "$status" -eq 0 ]
+}
+
+@test "events --interval 100ms " {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # prove there is no carry-over of events.log from a prior test
+  [ ! -e events.log ]
+
+  # spawn two sub processes (shells)
+  # the first sub process is an event logger that sends stats events to events.log once every 100ms
+  # the second sub process tries 3 times for an event that includes test_busybox
+  # pausing 100ms between each attempt then kills the test_busybox container which
+  # causes the event logger to exit
+  (__runc events --interval 100ms test_busybox > events.log) &
+  (
+    retry 3 0.100 eval "grep -q 'test_busybox' events.log"
+    teardown_running_container test_busybox
+  ) &
+  wait # wait for the above sub shells to finish
+
+  [ -e events.log ]
+
+  run eval "grep -q 'test_busybox' events.log"
+  [ "$status" -eq 0 ]
+}
diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats
new file mode 100644 (file)
index 0000000..19647c1
--- /dev/null
@@ -0,0 +1,140 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc exec" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec test_busybox echo Hello from exec
+  [ "$status" -eq 0 ]
+  echo text echoed = "'""${output}""'"
+  [[ "${output}" == *"Hello from exec"* ]]
+}
+
+@test "runc exec --pid-file" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec --pid-file pid.txt test_busybox echo Hello from exec
+  [ "$status" -eq 0 ]
+  echo text echoed = "'""${output}""'"
+  [[ "${output}" == *"Hello from exec"* ]]
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ [0-9]+ ]]
+  [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]]
+}
+
+@test "runc exec --pid-file with new CWD" {
+  # create pid_file directory as the CWD
+  run mkdir pid_file
+  [ "$status" -eq 0 ]
+  run cd pid_file
+  [ "$status" -eq 0 ]
+
+  # run busybox detached
+  runc run -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec --pid-file pid.txt test_busybox echo Hello from exec
+  [ "$status" -eq 0 ]
+  echo text echoed = "'""${output}""'"
+  [[ "${output}" == *"Hello from exec"* ]]
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ [0-9]+ ]]
+  [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]]
+}
+
+@test "runc exec ls -la" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec test_busybox ls -la
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} == *"total"* ]]
+  [[ ${lines[1]} == *"."* ]]
+  [[ ${lines[2]} == *".."* ]]
+}
+
+@test "runc exec ls -la with --cwd" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec --cwd /bin test_busybox pwd
+  [ "$status" -eq 0 ]
+  [[ ${output} == "/bin"* ]]
+}
+
+@test "runc exec --env" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec --env RUNC_EXEC_TEST=true test_busybox env
+  [ "$status" -eq 0 ]
+
+  [[ ${output} == *"RUNC_EXEC_TEST=true"* ]]
+}
+
+@test "runc exec --user" {
+  # --user can't work in rootless containers that don't have idmap.
+  [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc exec --user 1000:1000 test_busybox id
+  [ "$status" -eq 0 ]
+
+  [[ "${output}" == "uid=1000 gid=1000"* ]]
+}
+
+@test "runc exec --additional-gids" {
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  wait_for_container 15 1 test_busybox
+
+  runc exec --user 1000:1000 --additional-gids 100 --additional-gids 65534 test_busybox id
+  [ "$status" -eq 0 ]
+
+  [[ ${output} == "uid=1000 gid=1000 groups=100(users),65534(nogroup)" ]]
+}
+
+@test "runc exec --preserve-fds" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  run bash -c "cat hello > preserve-fds.test; exec 3<preserve-fds.test; $RUNC ${RUNC_USE_SYSTEMD:+--systemd-cgroup} --log /proc/self/fd/2 --root $ROOT exec --preserve-fds=1 test_busybox cat /proc/self/fd/3"
+  [ "$status" -eq 0 ]
+
+  [[ "${output}" == *"hello"* ]]
+}
diff --git a/tests/integration/help.bats b/tests/integration/help.bats
new file mode 100644 (file)
index 0000000..163de2d
--- /dev/null
@@ -0,0 +1,87 @@
+#!/usr/bin/env bats
+
+load helpers
+
+@test "runc -h" {
+  runc -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ NAME:+ ]]
+  [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]]
+
+  runc --help
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ NAME:+ ]]
+  [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]]
+}
+
+@test "runc command -h" {
+  runc checkpoint -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ checkpoint+ ]]
+
+  runc delete -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ delete+ ]]
+
+  runc events -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ events+ ]]
+
+  runc exec -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ exec+ ]]
+
+  runc kill -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ kill+ ]]
+
+  runc list -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ NAME:+ ]]
+  [[ ${lines[1]} =~ runc\ list+ ]]
+
+  runc list --help
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ NAME:+ ]]
+  [[ ${lines[1]} =~ runc\ list+ ]]
+
+  runc pause -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ pause+ ]]
+
+  runc restore -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ restore+ ]]
+
+  runc resume -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ resume+ ]]
+
+  # We don't use runc_spec here, because we're just testing the help page.
+  runc spec -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ spec+ ]]
+
+  runc start -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ start+ ]]
+
+  runc run -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ run+ ]]
+
+  runc state -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ state+ ]]
+
+  runc update -h
+  [ "$status" -eq 0 ]
+  [[ ${lines[1]} =~ runc\ update+ ]]
+
+}
+
+@test "runc foo -h" {
+  runc foo -h
+  [ "$status" -ne 0 ]
+  [[ "${output}" == *"No help topic for 'foo'"* ]]
+}
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
new file mode 100644 (file)
index 0000000..8862dcb
--- /dev/null
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# Root directory of integration tests.
+INTEGRATION_ROOT=$(dirname "$(readlink -f "$BASH_SOURCE")")
+
+. ${INTEGRATION_ROOT}/multi-arch.bash
+
+RUNC="${INTEGRATION_ROOT}/../../runc"
+RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty"
+GOPATH="$(mktemp -d --tmpdir runc-integration-gopath.XXXXXX)"
+
+# Test data path.
+TESTDATA="${INTEGRATION_ROOT}/testdata"
+
+# Busybox image
+BUSYBOX_IMAGE="$BATS_TMPDIR/busybox.tar"
+BUSYBOX_BUNDLE="$BATS_TMPDIR/busyboxtest"
+
+# hello-world in tar format
+HELLO_FILE=$(get_hello)
+HELLO_IMAGE="$TESTDATA/$HELLO_FILE"
+HELLO_BUNDLE="$BATS_TMPDIR/hello-world"
+
+# CRIU PATH
+CRIU="$(which criu || true)"
+
+# Kernel version
+KERNEL_VERSION="$(uname -r)"
+KERNEL_MAJOR="${KERNEL_VERSION%%.*}"
+KERNEL_MINOR="${KERNEL_VERSION#$KERNEL_MAJOR.}"
+KERNEL_MINOR="${KERNEL_MINOR%%.*}"
+
+# Root state path.
+ROOT=$(mktemp -d "$BATS_TMPDIR/runc.XXXXXX")
+
+# Path to console socket.
+CONSOLE_SOCKET="$BATS_TMPDIR/console.sock"
+
+# Cgroup paths
+CGROUP_MEMORY_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<MEMORY\>/ { print $5; exit }')
+CGROUP_CPU_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<CPU\>/ { print $5; exit }')
+if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then
+       CGROUPS_PATH="/machine.slice/runc-cgroups-integration-test.scope"
+else
+       CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup"
+fi
+CGROUP_MEMORY="${CGROUP_MEMORY_BASE_PATH}${CGROUPS_PATH}"
+
+# CONFIG_MEMCG_KMEM support
+KMEM="${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes"
+RT_PERIOD="${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us"
+
+# Check if we're in rootless mode.
+ROOTLESS=$(id -u)
+
+# Wrapper for runc.
+function runc() {
+       run __runc "$@"
+
+       # Some debug information to make life easier. bats will only print it if the
+       # test failed, in which case the output is useful.
+       echo "runc $@ (status=$status):" >&2
+       echo "$output" >&2
+}
+
+# Raw wrapper for runc.
+function __runc() {
+       "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} --root "$ROOT" "$@"
+}
+
+# Wrapper for runc spec, which takes only one argument (the bundle path).
+function runc_spec() {
+       ! [[ "$#" > 1 ]]
+
+       local args=()
+       local bundle=""
+
+       if [ "$ROOTLESS" -ne 0 ]; then
+               args+=("--rootless")
+       fi
+       if [ "$#" -ne 0 ]; then
+               bundle="$1"
+               args+=("--bundle" "$bundle")
+       fi
+
+       runc spec "${args[@]}"
+
+       # Always add additional mappings if we have idmaps.
+       if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"idmap"* ]]; then
+               runc_rootless_idmap "$bundle"
+       fi
+
+       # Ensure config.json contains linux.resources
+       if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then
+               runc_rootless_cgroup "$bundle"
+       fi
+}
+
+# Shortcut to add additional uids and gids, based on the values set as part of
+# a rootless configuration.
+function runc_rootless_idmap() {
+       bundle="${1:-.}"
+       cat "$bundle/config.json" \
+               | jq '.mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .)' \
+               | jq '.linux.uidMappings |= .+ [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}]' \
+               | jq '.linux.gidMappings |= .+ [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}]' \
+               | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+10))"', "containerID": 1, "size": 20}]' \
+               | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+100))"', "containerID": 1000, "size": '"$(($ROOTLESS_GIDMAP_LENGTH-1000))"'}]' \
+               >"$bundle/config.json.tmp"
+       mv "$bundle/config.json"{.tmp,}
+}
+
+# Shortcut to add empty resources as part of a rootless configuration.
+function runc_rootless_cgroup() {
+       bundle="${1:-.}"
+       cat "$bundle/config.json" \
+               | jq '.linux.resources |= .+ {"memory":{},"cpu":{},"blockio":{},"pids":{}}' \
+               >"$bundle/config.json.tmp"
+       mv "$bundle/config.json"{.tmp,}
+}
+
+# Helper function to set cgroupsPath to the value of $CGROUPS_PATH
+function set_cgroups_path() {
+  bundle="${1:-.}"
+  cgroups_path="/runc-cgroups-integration-test/test-cgroup"
+  if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then
+    cgroups_path="machine.slice:runc-cgroups:integration-test"
+  fi
+  sed -i 's#\("linux": {\)#\1\n    "cgroupsPath": "'"${cgroups_path}"'",#' "$bundle/config.json"
+}
+
+# Helper function to set a resources limit
+function set_resources_limit() {
+  bundle="${1:-.}"
+  sed -i 's/\("linux": {\)/\1\n   "resources": { "pids": { "limit": 100 } },/'  "$bundle/config.json"
+}
+
+# Fails the current test, providing the error given.
+function fail() {
+       echo "$@" >&2
+       exit 1
+}
+
+# Allows a test to specify what things it requires. If the environment can't
+# support it, the test is skipped with a message.
+function requires() {
+       for var in "$@"; do
+               case $var in
+               criu)
+                       if [ ! -e "$CRIU" ]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               root)
+                       if [ "$ROOTLESS" -ne 0 ]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               rootless)
+                       if [ "$ROOTLESS" -eq 0 ]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               rootless_idmap)
+                       if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               rootless_cgroup)
+                       if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               rootless_no_cgroup)
+                       if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then
+                               skip "test requires ${var}"
+                       fi
+                       ;;
+               cgroups_kmem)
+                       if [ ! -e "$KMEM" ]; then
+				skip "test requires ${var}"
+                       fi
+                       ;;
+               cgroups_rt)
+                       if [ ! -e "$RT_PERIOD" ]; then
+				skip "test requires ${var}"
+                       fi
+                       ;;
+               *)
+                       fail "BUG: Invalid requires ${var}."
+                       ;;
+               esac
+       done
+}
+
+# Retry a command $1 times until it succeeds. Wait $2 seconds between retries.
+function retry() {
+       local attempts=$1
+       shift
+       local delay=$1
+       shift
+       local i
+
+       for ((i = 0; i < attempts; i++)); do
+               run "$@"
+               if [[ "$status" -eq 0 ]]; then
+                       return 0
+               fi
+               sleep $delay
+       done
+
+       echo "Command \"$@\" failed $attempts times. Output: $output"
+       false
+}
+
+# retry until the given container reaches the given state
+function wait_for_container() {
+       local attempts=$1
+       local delay=$2
+       local cid=$3
+       # optionally wait for a specific status
+       local wait_for_status="${4:-}"
+       local i
+
+       for ((i = 0; i < attempts; i++)); do
+               runc state $cid
+               if [[ "$status" -eq 0 ]]; then
+                       if [[ "${output}" == *"${wait_for_status}"* ]]; then
+                               return 0
+                       fi
+               fi
+               sleep $delay
+       done
+
+	echo "runc state failed to return state $wait_for_status $attempts times. Output: $output"
+       false
+}
+
+# retry until the given container (under the given root) reaches the given state
+function wait_for_container_inroot() {
+       local attempts=$1
+       local delay=$2
+       local cid=$3
+       # optionally wait for a specific status
+	local wait_for_status="${5:-}"
+       local i
+
+       for ((i = 0; i < attempts; i++)); do
+               ROOT=$4 runc state $cid
+               if [[ "$status" -eq 0 ]]; then
+                       if [[ "${output}" == *"${wait_for_status}"* ]]; then
+                               return 0
+                       fi
+               fi
+               sleep $delay
+       done
+
+	echo "runc state failed to return state $wait_for_status $attempts times. Output: $output"
+       false
+}
+
+function testcontainer() {
+       # test state of container
+       runc state $1
+       [ "$status" -eq 0 ]
+       [[ "${output}" == *"$2"* ]]
+}
+
+function setup_recvtty() {
+       # We need to start recvtty in the background, so we double fork in the shell.
+       ("$RECVTTY" --pid-file "$BATS_TMPDIR/recvtty.pid" --mode null "$CONSOLE_SOCKET" &) &
+}
+
+function teardown_recvtty() {
+       # When we kill recvtty, the container will also be killed.
+       if [ -f "$BATS_TMPDIR/recvtty.pid" ]; then
+               kill -9 $(cat "$BATS_TMPDIR/recvtty.pid")
+       fi
+
+       # Clean up the files that might be left over.
+       rm -f "$BATS_TMPDIR/recvtty.pid"
+       rm -f "$CONSOLE_SOCKET"
+}
+
+function setup_busybox() {
+       setup_recvtty
+       run mkdir "$BUSYBOX_BUNDLE"
+       run mkdir "$BUSYBOX_BUNDLE"/rootfs
+       if [ -e "/testdata/busybox.tar" ]; then
+               BUSYBOX_IMAGE="/testdata/busybox.tar"
+       fi
+       if [ ! -e $BUSYBOX_IMAGE ]; then
+		curl -o $BUSYBOX_IMAGE -sSL $(get_busybox)
+       fi
+       tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE"
+       cd "$BUSYBOX_BUNDLE"
+       runc_spec
+}
+
+function setup_hello() {
+       setup_recvtty
+       run mkdir "$HELLO_BUNDLE"
+       run mkdir "$HELLO_BUNDLE"/rootfs
+       tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE"
+       cd "$HELLO_BUNDLE"
+       runc_spec
+       sed -i 's;"sh";"/hello";' config.json
+}
+
+function teardown_running_container() {
+       runc list
+       # $1 should be a container name such as "test_busybox"
+	# we match "$1 " (with a trailing space) so that a name sharing a prefix
+	# with another is not confused, e.g. "test_busybox" vs "test_busybox_update"
+       if [[ "${output}" == *"$1 "* ]]; then
+               runc kill $1 KILL
+               retry 10 1 eval "__runc state '$1' | grep -q 'stopped'"
+               runc delete $1
+       fi
+}
+
+function teardown_running_container_inroot() {
+       ROOT=$2 runc list
+       # $1 should be a container name such as "test_busybox"
+	# we match "$1 " (with a trailing space) so that a name sharing a prefix
+	# with another is not confused, e.g. "test_busybox" vs "test_busybox_update"
+       if [[ "${output}" == *"$1 "* ]]; then
+               ROOT=$2 runc kill $1 KILL
+               retry 10 1 eval "ROOT='$2' __runc state '$1' | grep -q 'stopped'"
+               ROOT=$2 runc delete $1
+       fi
+}
+
+function teardown_busybox() {
+       cd "$INTEGRATION_ROOT"
+       teardown_recvtty
+       teardown_running_container test_busybox
+       run rm -f -r "$BUSYBOX_BUNDLE"
+}
+
+function teardown_hello() {
+       cd "$INTEGRATION_ROOT"
+       teardown_recvtty
+       teardown_running_container test_hello
+       run rm -f -r "$HELLO_BUNDLE"
+}
diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats
new file mode 100644 (file)
index 0000000..d9afe92
--- /dev/null
@@ -0,0 +1,30 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+
+@test "kill detached busybox" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc kill test_busybox KILL
+  [ "$status" -eq 0 ]
+
+  retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+  runc delete test_busybox
+  [ "$status" -eq 0 ]
+}
diff --git a/tests/integration/list.bats b/tests/integration/list.bats
new file mode 100644 (file)
index 0000000..0a938c0
--- /dev/null
@@ -0,0 +1,56 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_running_container_inroot test_box1 $HELLO_BUNDLE
+  teardown_running_container_inroot test_box2 $HELLO_BUNDLE
+  teardown_running_container_inroot test_box3 $HELLO_BUNDLE
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_running_container_inroot test_box1 $HELLO_BUNDLE
+  teardown_running_container_inroot test_box2 $HELLO_BUNDLE
+  teardown_running_container_inroot test_box3 $HELLO_BUNDLE
+  teardown_busybox
+}
+
+@test "list" {
+  # run a few busyboxes detached
+  ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box1
+  [ "$status" -eq 0 ]
+
+  ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box2
+  [ "$status" -eq 0 ]
+
+  ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box3
+  [ "$status" -eq 0 ]
+
+  ROOT=$HELLO_BUNDLE runc list
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]]
+  [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+  [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+  [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+
+  ROOT=$HELLO_BUNDLE runc list -q
+  [ "$status" -eq 0 ]
+  [[ "${lines[0]}" == "test_box1" ]]
+  [[ "${lines[1]}" == "test_box2" ]]
+  [[ "${lines[2]}" == "test_box3" ]]
+
+  ROOT=$HELLO_BUNDLE runc list --format table
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]]
+  [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+  [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+  [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+
+  ROOT=$HELLO_BUNDLE runc list --format json
+  [ "$status" -eq 0 ]
+  [[ "${lines[0]}" == [\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]]
+  [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]]
+  [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]]
+}
diff --git a/tests/integration/mask.bats b/tests/integration/mask.bats
new file mode 100644 (file)
index 0000000..aaa8042
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+       teardown_busybox
+       setup_busybox
+
+       # Create fake rootfs.
+       mkdir rootfs/testdir
+       echo "Forbidden information!" > rootfs/testfile
+
+       # add extra masked paths
+       sed -i 's;"maskedPaths": \[;"maskedPaths": \["/testdir","/testfile",;g' config.json
+}
+
+function teardown() {
+       teardown_busybox
+}
+
+@test "mask paths [file]" {
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       runc exec test_busybox cat /testfile
+       [ "$status" -eq 0 ]
+       [[ "${output}" == "" ]]
+
+       runc exec test_busybox rm -f /testfile
+       [ "$status" -eq 1 ]
+       [[ "${output}" == *"Read-only file system"* ]]
+
+       runc exec test_busybox umount /testfile
+       [ "$status" -eq 1 ]
+       [[ "${output}" == *"Operation not permitted"* ]]
+}
+
+@test "mask paths [directory]" {
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       runc exec test_busybox ls /testdir
+       [ "$status" -eq 0 ]
+       [[ "${output}" == "" ]]
+
+       runc exec test_busybox touch /testdir/foo
+       [ "$status" -eq 1 ]
+       [[ "${output}" == *"Read-only file system"* ]]
+
+       runc exec test_busybox rm -rf /testdir
+       [ "$status" -eq 1 ]
+       [[ "${output}" == *"Read-only file system"* ]]
+
+       runc exec test_busybox umount /testdir
+       [ "$status" -eq 1 ]
+       [[ "${output}" == *"Operation not permitted"* ]]
+}
diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats
new file mode 100755 (executable)
index 0000000..c35b3c5
--- /dev/null
@@ -0,0 +1,21 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+       teardown_busybox
+       setup_busybox
+}
+
+function teardown() {
+       teardown_busybox
+}
+
+@test "runc run [bind mount]" {
+       CONFIG=$(jq '.mounts |= . + [{"source": ".", "destination": "/tmp/bind", "options": ["bind"]}] | .process.args = ["ls", "/tmp/bind/config.json"]' config.json)
+       echo "${CONFIG}" >config.json
+
+       runc run test_bind_mount
+       [ "$status" -eq 0 ]
+       [[ "${lines[0]}" =~ '/tmp/bind/config.json' ]]
+}
diff --git a/tests/integration/multi-arch.bash b/tests/integration/multi-arch.bash
new file mode 100644 (file)
index 0000000..5616bf7
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/bash
+get_busybox(){
+       case $(go env GOARCH) in
+       arm64)
+               echo 'https://github.com/docker-library/busybox/raw/dist-arm64v8/glibc/busybox.tar.xz'
+       ;;
+       *)
+               echo 'https://github.com/docker-library/busybox/raw/dist-amd64/glibc/busybox.tar.xz'
+       ;;
+       esac
+}
+
+get_hello(){
+       case $(go env GOARCH) in
+        arm64)
+                echo 'hello-world-aarch64.tar'
+        ;;
+        *)
+                echo 'hello-world.tar'
+        ;;
+        esac
+}
diff --git a/tests/integration/pause.bats b/tests/integration/pause.bats
new file mode 100644 (file)
index 0000000..4e25e59
--- /dev/null
@@ -0,0 +1,72 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc pause and resume" {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  # pause busybox
+  runc pause test_busybox
+  [ "$status" -eq 0 ]
+
+  # test state of busybox is paused
+  testcontainer test_busybox paused
+
+  # resume busybox
+  runc resume test_busybox
+  [ "$status" -eq 0 ]
+
+  # test state of busybox is back to running
+  testcontainer test_busybox running
+}
+
+@test "runc pause and resume with nonexist container" {
+  # XXX: currently cgroups require root containers.
+  requires root
+
+  # run test_busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  # pause test_busybox and nonexistent container
+  runc pause test_busybox
+  [ "$status" -eq 0 ]
+  runc pause nonexistent
+  [ "$status" -ne 0 ]
+
+  # test state of test_busybox is paused
+  testcontainer test_busybox paused
+
+  # resume test_busybox and nonexistent container
+  runc resume test_busybox
+  [ "$status" -eq 0 ]
+  runc resume nonexistent
+  [ "$status" -ne 0 ]
+
+  # test state of test_busybox is back to running
+  testcontainer test_busybox running
+
+  # delete test_busybox
+  runc delete --force test_busybox
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+}
diff --git a/tests/integration/ps.bats b/tests/integration/ps.bats
new file mode 100644 (file)
index 0000000..646b5ab
--- /dev/null
@@ -0,0 +1,62 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "ps" {
+  # ps requires cgroups, so it needs a root (non-rootless) container
+  requires root
+
+  # start busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc ps test_busybox
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]]
+  [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]]
+}
+
+@test "ps -f json" {
+  # ps requires cgroups, so it needs a root (non-rootless) container
+  requires root
+
+  # start busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc ps -f json test_busybox
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ [0-9]+ ]]
+}
+
+@test "ps -e -x" {
+  # ps requires cgroups, so it needs a root (non-rootless) container
+  requires root
+
+  # start busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc ps test_busybox -e -x
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ \ +PID\ +TTY\ +STAT\ +TIME\ +COMMAND+ ]]
+  [[ "${lines[1]}" =~ [0-9]+ ]]
+}
diff --git a/tests/integration/root.bats b/tests/integration/root.bats
new file mode 100644 (file)
index 0000000..90b53b4
--- /dev/null
@@ -0,0 +1,50 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_running_container_inroot test_dotbox $HELLO_BUNDLE
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_running_container_inroot test_dotbox $HELLO_BUNDLE
+  teardown_busybox
+}
+
+@test "global --root" {
+  # run busybox detached using $HELLO_BUNDLE for state
+  ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_dotbox
+  [ "$status" -eq 0 ]
+
+  # run busybox detached in default root
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  runc state test_busybox
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"running"* ]]
+
+  ROOT=$HELLO_BUNDLE runc state test_dotbox
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"running"* ]]
+
+  ROOT=$HELLO_BUNDLE runc state test_busybox
+  [ "$status" -ne 0 ]
+
+  runc state test_dotbox
+  [ "$status" -ne 0 ]
+
+  runc kill test_busybox KILL
+  [ "$status" -eq 0 ]
+  retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+  runc delete test_busybox
+  [ "$status" -eq 0 ]
+
+  ROOT=$HELLO_BUNDLE runc kill test_dotbox KILL
+  [ "$status" -eq 0 ]
+  retry 10 1 eval "ROOT='$HELLO_BUNDLE' __runc state test_dotbox | grep -q 'stopped'"
+  ROOT=$HELLO_BUNDLE runc delete test_dotbox
+  [ "$status" -eq 0 ]
+}
diff --git a/tests/integration/spec.bats b/tests/integration/spec.bats
new file mode 100644 (file)
index 0000000..5df8f70
--- /dev/null
@@ -0,0 +1,96 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  # initial cleanup in case a prior test exited and did not cleanup
+  cd "$INTEGRATION_ROOT"
+  run rm -f -r "$HELLO_BUNDLE"
+
+  # setup hello-world for spec generation testing
+  run mkdir "$HELLO_BUNDLE"
+  run mkdir "$HELLO_BUNDLE"/rootfs
+  run tar -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE"
+}
+
+function teardown() {
+  cd "$INTEGRATION_ROOT"
+  run rm -f -r "$HELLO_BUNDLE"
+}
+
+@test "spec generation cwd" {
+  cd "$HELLO_BUNDLE"
+  # note this test runs from the bundle not the integration root
+
+  # test that config.json does not exist after the above partial setup
+  [ ! -e config.json ]
+
+  # test generation of spec does not return an error
+  runc_spec
+  [ "$status" -eq 0 ]
+
+  # test generation of spec created our config.json (spec)
+  [ -e config.json ]
+
+  # test existence of required args parameter in the generated config.json
+  run bash -c "grep -A2 'args' config.json | grep 'sh'"
+  [[ "${output}" == *"sh"* ]]
+
+  # change the default args parameter from sh to hello
+  sed -i 's;"sh";"/hello";' config.json
+
+  # ensure the generated spec works by running hello-world
+  runc run test_hello
+  [ "$status" -eq 0 ]
+}
+
+@test "spec generation --bundle" {
+  # note this test runs from the integration root not the bundle
+
+  # test that config.json does not exist after the above partial setup
+  [ ! -e "$HELLO_BUNDLE"/config.json ]
+
+  # test generation of spec does not return an error
+  runc_spec "$HELLO_BUNDLE"
+  [ "$status" -eq 0 ]
+
+  # test generation of spec created our config.json (spec)
+  [ -e "$HELLO_BUNDLE"/config.json ]
+
+  # change the default args parameter from sh to hello
+  sed -i 's;"sh";"/hello";' "$HELLO_BUNDLE"/config.json
+
+  # ensure the generated spec works by running hello-world
+  runc run --bundle "$HELLO_BUNDLE" test_hello
+  [ "$status" -eq 0 ]
+}
+
+@test "spec validator" {
+  TESTDIR=$(pwd)
+  cd "$HELLO_BUNDLE"
+
+  run git clone https://github.com/opencontainers/runtime-spec.git src/runtime-spec
+  [ "$status" -eq 0 ]
+
+  SPEC_COMMIT=$(grep '^github.com/opencontainers/runtime-spec' ${TESTDIR}/../../vendor.conf | tr -s ' ' | cut -d ' ' -f 2)
+  run git -C src/runtime-spec reset --hard "${SPEC_COMMIT}"
+
+  [ "$status" -eq 0 ]
+  [ -e src/runtime-spec/schema/config-schema.json ]
+
+  run bash -c "GOPATH='$GOPATH' go get github.com/xeipuuv/gojsonschema"
+  [ "$status" -eq 0 ]
+
+  run git -C "${GOPATH}/src/github.com/xeipuuv/gojsonschema" reset --hard 6637feb73ee44cd4640bb3def285c29774234c7f
+  [ "$status" -eq 0 ]
+
+  GOPATH="$GOPATH" go build src/runtime-spec/schema/validate.go
+  [ -e ./validate ]
+
+  runc spec
+  [ -e config.json ]
+
+  run ./validate src/runtime-spec/schema/config-schema.json config.json
+  [ "$status" -eq 0 ]
+  [[ "${lines[0]}" == *"The document is valid"* ]]
+}
diff --git a/tests/integration/start.bats b/tests/integration/start.bats
new file mode 100644 (file)
index 0000000..1f0ea8e
--- /dev/null
@@ -0,0 +1,31 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc start" {
+  runc create --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox created
+
+  # start container test_busybox
+  runc start test_busybox
+  [ "$status" -eq 0 ]
+
+  testcontainer test_busybox running
+
+  # delete test_busybox
+  runc delete --force test_busybox
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+}
diff --git a/tests/integration/start_detached.bats b/tests/integration/start_detached.bats
new file mode 100644 (file)
index 0000000..7f177b8
--- /dev/null
@@ -0,0 +1,76 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "runc run detached" {
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+}
+
+@test "runc run detached ({u,g}id != 0)" {
+  # cannot start containers as another user in rootless setup without idmap
+  [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+  # replace "uid": 0 with "uid": 1000
+  # and do a similar thing for gid.
+  sed -i 's;"uid": 0;"uid": 1000;g' config.json
+  sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+}
+
+@test "runc run detached --pid-file" {
+  # run busybox detached
+  runc run --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+}
+
+@test "runc run detached --pid-file with new CWD" {
+  # create pid_file directory as the CWD
+  run mkdir pid_file
+  [ "$status" -eq 0 ]
+  run cd pid_file
+  [ "$status" -eq 0 ]
+
+  # run busybox detached
+  runc run --pid-file pid.txt -d  -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+}
diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats
new file mode 100644 (file)
index 0000000..a706be2
--- /dev/null
@@ -0,0 +1,64 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_hello
+  setup_hello
+}
+
+function teardown() {
+  teardown_hello
+}
+
+@test "runc run" {
+  # run hello-world
+  runc run test_hello
+  [ "$status" -eq 0 ]
+
+  # check expected output
+  [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run ({u,g}id != 0)" {
+  # cannot start containers as another user in rootless setup without idmap
+  [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+  # replace "uid": 0 with "uid": 1000
+  # and do a similar thing for gid.
+  sed -i 's;"uid": 0;"uid": 1000;g' config.json
+  sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+  # run hello-world
+  runc run test_hello
+  [ "$status" -eq 0 ]
+
+  # check expected output
+  [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run with rootfs set to ." {
+  cp config.json rootfs/.
+  rm config.json
+  cd rootfs
+  sed -i 's;"rootfs";".";' config.json
+
+  # run hello-world
+  runc run test_hello
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run --pid-file" {
+  # run hello-world
+  runc run --pid-file pid.txt test_hello
+  [ "$status" -eq 0 ]
+  [[ "${output}" == *"Hello"* ]]
+
+  # check pid.txt was generated
+  [ -e pid.txt ]
+
+  run cat pid.txt
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ [0-9]+ ]]
+}
diff --git a/tests/integration/state.bats b/tests/integration/state.bats
new file mode 100644 (file)
index 0000000..68dae38
--- /dev/null
@@ -0,0 +1,66 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+  teardown_busybox
+  setup_busybox
+}
+
+function teardown() {
+  teardown_busybox
+}
+
+@test "state (kill + delete)" {
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  runc kill test_busybox KILL
+  [ "$status" -eq 0 ]
+
+  # wait for busybox to be in the stopped state
+  retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+  # delete test_busybox
+  runc delete test_busybox
+  [ "$status" -eq 0 ]
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+}
+
+@test "state (pause + resume)" {
+  # XXX: pause and resume require cgroups.
+  requires root
+
+  runc state test_busybox
+  [ "$status" -ne 0 ]
+
+  # run busybox detached
+  runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+  [ "$status" -eq 0 ]
+
+  # check state
+  testcontainer test_busybox running
+
+  # pause busybox
+  runc pause test_busybox
+  [ "$status" -eq 0 ]
+
+  # test state of busybox is paused
+  testcontainer test_busybox paused
+
+  # resume busybox
+  runc resume test_busybox
+  [ "$status" -eq 0 ]
+
+  # test state of busybox is back to running
+  testcontainer test_busybox running
+}
diff --git a/tests/integration/testdata/hello-world-aarch64.tar b/tests/integration/testdata/hello-world-aarch64.tar
new file mode 100644 (file)
index 0000000..186c8ae
Binary files /dev/null and b/tests/integration/testdata/hello-world-aarch64.tar differ
diff --git a/tests/integration/testdata/hello-world.tar b/tests/integration/testdata/hello-world.tar
new file mode 100644 (file)
index 0000000..aec830e
Binary files /dev/null and b/tests/integration/testdata/hello-world.tar differ
diff --git a/tests/integration/tty.bats b/tests/integration/tty.bats
new file mode 100644 (file)
index 0000000..688875d
--- /dev/null
@@ -0,0 +1,230 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+       teardown_busybox
+       setup_busybox
+}
+
+function teardown() {
+       teardown_busybox
+}
+
+@test "runc run [tty ptsname]" {
+       # Replace sh script with readlink.
+    sed -i 's|"sh"|"sh", "-c", "for file in /proc/self/fd/[012]; do readlink $file; done"|' config.json
+
+       # run busybox
+       runc run test_busybox
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ /dev/pts/+ ]]
+       [[ ${lines[1]} =~ /dev/pts/+ ]]
+       [[ ${lines[2]} =~ /dev/pts/+ ]]
+}
+
+@test "runc run [tty owner]" {
+       # tty chmod is not doable in rootless containers without idmap.
+       # TODO: this could be folded into the {u,g}id variant of this test.
+       [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+       # Replace sh script with stat.
+       sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json
+
+       # run busybox
+       runc run test_busybox
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ 0 ]]
+       # This is set by the default config.json (it corresponds to the standard tty group).
+       [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc run [tty owner] ({u,g}id != 0)" {
+       # tty chmod is not doable in rootless containers without idmap.
+       [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+       # replace "uid": 0 with "uid": 1000
+       # and do a similar thing for gid.
+       sed -i 's;"uid": 0;"uid": 1000;g' config.json
+       sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+       # Replace sh script with stat.
+       sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json
+
+       # run busybox
+       runc run test_busybox
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ 1000 ]]
+       # This is set by the default config.json (it corresponds to the standard tty group).
+       [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc exec [tty ptsname]" {
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       # make sure we're running
+       testcontainer test_busybox running
+
+       # run the exec
+    runc exec test_busybox sh -c 'for file in /proc/self/fd/[012]; do readlink $file; done'
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ /dev/pts/+ ]]
+       [[ ${lines[1]} =~ /dev/pts/+ ]]
+       [[ ${lines[2]} =~ /dev/pts/+ ]]
+}
+
+@test "runc exec [tty owner]" {
+       # tty chmod is not doable in rootless containers without idmap.
+       # TODO: this can be made as a change to the gid test.
+       [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       # make sure we're running
+       testcontainer test_busybox running
+
+       # run the exec
+    runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n'
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ 0 ]]
+       [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc exec [tty owner] ({u,g}id != 0)" {
+       # tty chmod is not doable in rootless containers without idmap.
+       [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+       # replace "uid": 0 with "uid": 1000
+       # and do a similar thing for gid.
+       sed -i 's;"uid": 0;"uid": 1000;g' config.json
+       sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       # make sure we're running
+       testcontainer test_busybox running
+
+       # run the exec
+       runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n'
+       [ "$status" -eq 0 ]
+       [[ ${lines[0]} =~ 1000 ]]
+       [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc exec [tty consolesize]" {
+       # allow writing to filesystem
+       sed -i 's/"readonly": true/"readonly": false/' config.json
+
+       # run busybox detached
+       runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+       [ "$status" -eq 0 ]
+
+       # make sure we're running
+       testcontainer test_busybox running
+
+       tty_info_with_consize_size=$( cat <<EOF
+{
+    "terminal": true,
+    "consoleSize": {
+           "height": 10,
+           "width": 110
+    },
+    "args": [
+           "/bin/sh",
+           "-c",
+           "/bin/stty -a > /tmp/tty-info"
+    ],
+    "cwd": "/"
+}
+EOF
+       )
+
+       # run the exec
+       runc exec --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET -p <( echo $tty_info_with_consize_size ) test_busybox
+       [ "$status" -eq 0 ]
+
+       # check the pid was generated
+       [ -e pid.txt ]
+
+       # wait for the user process to finish
+       timeout 1 tail --pid=$(head -n 1 pid.txt) -f /dev/null
+
+       tty_info=$( cat <<EOF
+{
+    "args": [
+       "/bin/cat",
+       "/tmp/tty-info"
+    ],
+    "cwd": "/"
+}
+EOF
+       )
+
+       # run the exec
+       runc exec -p <( echo $tty_info ) test_busybox
+       [ "$status" -eq 0 ]
+
+       # test tty width and height against original process.json
+       [[ ${lines[0]} =~ "rows 10; columns 110" ]]
+}
+
+@test "runc create [terminal=false]" {
+       # Disable terminal creation.
+       sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+       # Replace sh script with sleep.
+    sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+       # Make sure that the handling of detached IO is done properly. See #1354.
+       __runc create test_busybox
+
+       # Start the command.
+       runc start test_busybox
+       [ "$status" -eq 0 ]
+
+       testcontainer test_busybox running
+
+       # Kill the container.
+       runc kill test_busybox KILL
+       [ "$status" -eq 0 ]
+}
+
+@test "runc run [terminal=false]" {
+       # Disable terminal creation.
+       sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+       # Replace sh script with sleep.
+    sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+       # Make sure that the handling of non-detached IO is done properly. See #1354.
+       (
+               __runc run test_busybox
+       ) &
+
+       wait_for_container 15 1 test_busybox running
+       testcontainer test_busybox running
+
+       # Kill the container.
+       runc kill test_busybox KILL
+       [ "$status" -eq 0 ]
+}
+
+@test "runc run -d [terminal=false]" {
+       # Disable terminal creation.
+       sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+       # Replace sh script with sleep.
+    sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+       # Make sure that the handling of detached IO is done properly. See #1354.
+       __runc run -d test_busybox
+
+       testcontainer test_busybox running
+
+       # Kill the container.
+       runc kill test_busybox KILL
+       [ "$status" -eq 0 ]
+}
diff --git a/tests/integration/update.bats b/tests/integration/update.bats
new file mode 100644 (file)
index 0000000..549b2ec
--- /dev/null
@@ -0,0 +1,269 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function teardown() {
+    rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json
+    teardown_running_container test_update
+    teardown_running_container test_update_rt
+    teardown_busybox
+}
+
+function setup() {
+    teardown
+    setup_busybox
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+
+    # Set some initial known values
+    DATA=$(cat <<EOF
+    "memory": {
+        "limit": 33554432,
+        "reservation": 25165824,
+        "kernel": 16777216,
+        "kernelTCP": 11534336
+    },
+    "cpu": {
+        "shares": 100,
+        "quota": 500000,
+        "period": 1000000,
+        "cpus": "0"
+    },
+    "pids": {
+        "limit": 20
+    },
+EOF
+    )
+    DATA=$(echo ${DATA} | sed 's/\n/\\n/g')
+    sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json
+}
+
+# check_cgroup_value <cgroup-dir> <control-file> <expected>
+# Asserts that the cgroup control file <cgroup-dir>/<control-file>
+# contains exactly <expected>; fails the test otherwise.
+function check_cgroup_value() {
+    cgroup=$1
+    source=$2
+    expected=$3
+
+    current=$(cat $cgroup/$source)
+    [ "$current" == "$expected" ]
+}
+
+# TODO: test rt cgroup updating
+@test "update" {
+    # XXX: Also, this test should be split into separate sections so that we
+    #      can skip kmem without skipping update tests overall.
+    [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+    requires cgroups_kmem
+
+    # run a few busyboxes detached
+    runc run -d --console-socket $CONSOLE_SOCKET test_update
+    [ "$status" -eq 0 ]
+
+    # get the cgroup paths
+    for g in MEMORY CPUSET CPU BLKIO PIDS; do
+        base_path=$(grep "cgroup"  /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'${g}'\>/ { print $5; exit }')
+        eval CGROUP_${g}="${base_path}${CGROUPS_PATH}"
+    done
+
+    CGROUP_SYSTEM_MEMORY=$(grep "cgroup"  /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'MEMORY'\>/ { print $5; exit }')
+
+    # check that initial values were properly set
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+    check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+    check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+    check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+    check_cgroup_value $CGROUP_PIDS "pids.max" 20
+
+    # update cpu-period
+    runc update test_update --cpu-period 900000
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000
+
+    # update cpu-quota
+    runc update test_update --cpu-quota 600000
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000
+
+    # update cpu-shares
+    runc update test_update --cpu-share 200
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.shares" 200
+
+    # update cpuset if supported (i.e. we're running on a multicore cpu)
+    cpu_count=$(grep '^processor' /proc/cpuinfo | wc -l)
+    if [ $cpu_count -gt 1 ]; then
+        runc update test_update --cpuset-cpus "1"
+        [ "$status" -eq 0 ]
+        check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 1
+    fi
+
+    # update memory limit
+    runc update test_update --memory 67108864
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864
+
+    runc update test_update --memory 50M
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 52428800
+
+    # update memory soft limit
+    runc update test_update --memory-reservation 33554432
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432
+
+    # Run swap memory tests if swap is available
+    if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then
+        # try to remove memory swap limit
+        runc update test_update --memory-swap -1
+        [ "$status" -eq 0 ]
+        # Get System memory swap limit
+        SYSTEM_MEMORY_SW=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.memsw.limit_in_bytes")
+        check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY_SW}
+
+        # update memory swap
+        runc update test_update --memory-swap 96468992
+        [ "$status" -eq 0 ]
+        check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" 96468992
+    fi;
+
+    # try to remove memory limit
+    runc update test_update --memory -1
+    [ "$status" -eq 0 ]
+
+    # Get System memory limit
+    SYSTEM_MEMORY=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.limit_in_bytes")
+       # check that the memory limit is gone
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" ${SYSTEM_MEMORY}
+
+    # check that the swap memory limit is gone
+    if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then
+        check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY}
+    fi
+
+    # update kernel memory limit
+    runc update test_update --kernel-memory 50331648
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+
+    # update kernel memory tcp limit
+    runc update test_update --kernel-memory-tcp 41943040
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040
+
+    # update pids limit
+    runc update test_update --pids-limit 10
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_PIDS "pids.max" 10
+
+    # Revert to the initial test values via JSON on stdin
+    runc update  -r - test_update <<EOF
+{
+  "memory": {
+    "limit": 33554432,
+    "reservation": 25165824,
+    "kernel": 16777216,
+    "kernelTCP": 11534336
+  },
+  "cpu": {
+    "shares": 100,
+    "quota": 500000,
+    "period": 1000000,
+    "cpus": "0"
+  },
+  "pids": {
+    "limit": 20
+  }
+}
+EOF
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+    check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+    check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+    check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+    check_cgroup_value $CGROUP_PIDS "pids.max" 20
+
+    # redo all the changes at once
+    runc update test_update \
+        --cpu-period 900000 --cpu-quota 600000 --cpu-share 200 --memory 67108864 \
+        --memory-reservation 33554432 --kernel-memory 50331648 --kernel-memory-tcp 41943040 \
+        --pids-limit 10
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000
+    check_cgroup_value $CGROUP_CPU "cpu.shares" 200
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864
+    check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432
+    check_cgroup_value $CGROUP_PIDS "pids.max" 10
+
+    # reset to initial test value via json file
+    DATA=$(cat <<"EOF"
+{
+  "memory": {
+    "limit": 33554432,
+    "reservation": 25165824,
+    "kernel": 16777216,
+    "kernelTCP": 11534336
+  },
+  "cpu": {
+    "shares": 100,
+    "quota": 500000,
+    "period": 1000000,
+    "cpus": "0"
+  },
+  "pids": {
+    "limit": 20
+  }
+}
+EOF
+)
+    echo $DATA > $BATS_TMPDIR/runc-cgroups-integration-test.json
+
+    runc update  -r $BATS_TMPDIR/runc-cgroups-integration-test.json test_update
+    [ "$status" -eq 0 ]
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+    check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+    check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+    check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+    check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+    check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+    check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+    check_cgroup_value $CGROUP_PIDS "pids.max" 20
+}
+
+@test "update rt period and runtime" {
+    [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+    requires cgroups_kmem cgroups_rt
+
+    # run a detached busybox
+    runc run -d --console-socket $CONSOLE_SOCKET test_update_rt
+    [ "$status" -eq 0 ]
+
+    # get the cgroup paths
+    eval CGROUP_CPU="${CGROUP_CPU_BASE_PATH}${CGROUPS_PATH}"
+
+    runc update  -r - test_update_rt <<EOF
+{
+  "cpu": {
+    "realtimePeriod": 800001,
+    "realtimeRuntime": 500001
+  }
+}
+EOF
+    check_cgroup_value $CGROUP_CPU "cpu.rt_period_us" 800001
+    check_cgroup_value $CGROUP_CPU "cpu.rt_runtime_us" 500001
+
+    runc update test_update_rt --cpu-rt-period 900001 --cpu-rt-runtime 600001
+
+    check_cgroup_value $CGROUP_CPU "cpu.rt_period_us" 900001
+    check_cgroup_value $CGROUP_CPU "cpu.rt_runtime_us" 600001
+}
diff --git a/tests/integration/version.bats b/tests/integration/version.bats
new file mode 100644 (file)
index 0000000..ab77769
--- /dev/null
@@ -0,0 +1,11 @@
+#!/usr/bin/env bats
+
+load helpers
+
+@test "runc version" {
+  runc -v
+  [ "$status" -eq 0 ]
+  [[ ${lines[0]} =~ runc\ version\ [0-9]+\.[0-9]+\.[0-9]+ ]]
+  [[ ${lines[1]} =~ commit:+ ]]
+  [[ ${lines[2]} =~ spec:\ [0-9]+\.[0-9]+\.[0-9]+ ]]
+}
diff --git a/tests/rootless.sh b/tests/rootless.sh
new file mode 100755 (executable)
index 0000000..847c286
--- /dev/null
@@ -0,0 +1,125 @@
+#!/bin/bash
+# Copyright (C) 2017 SUSE LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# rootless.sh -- Runner for rootless container tests. The purpose of this
+# script is to allow for the addition (and testing) of "opportunistic" features
+# to rootless containers while still testing the base features. In order to add
+# a new feature, please match the existing style. Add an entry to $ALL_FEATURES,
+# and add an enable_* and disable_* hook.
+
+ALL_FEATURES=("idmap" "cgroup")
+ROOT="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")"
+
+# FEATURE: Opportunistic new{uid,gid}map support, allowing a rootless container
+#          to be set up with the usage of helper setuid binaries.
+
+function enable_idmap() {
+       export ROOTLESS_UIDMAP_START=100000 ROOTLESS_UIDMAP_LENGTH=65536
+       export ROOTLESS_GIDMAP_START=200000 ROOTLESS_GIDMAP_LENGTH=65536
+
+       # Set up sub{uid,gid} mappings.
+       [ -e /etc/subuid.tmp ] && mv /etc/subuid{.tmp,}
+       ( grep -v '^rootless' /etc/subuid ; echo "rootless:$ROOTLESS_UIDMAP_START:$ROOTLESS_UIDMAP_LENGTH" ) > /etc/subuid.tmp
+       mv /etc/subuid{.tmp,}
+       [ -e /etc/subgid.tmp ] && mv /etc/subgid{.tmp,}
+       ( grep -v '^rootless' /etc/subgid ; echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" ) > /etc/subgid.tmp
+       mv /etc/subgid{.tmp,}
+
+       # Reactivate new{uid,gid}map helpers if applicable.
+       [ -e /usr/bin/unused-newuidmap ] && mv /usr/bin/{unused-,}newuidmap
+       [ -e /usr/bin/unused-newgidmap ] && mv /usr/bin/{unused-,}newgidmap
+}
+
+function disable_idmap() {
+       export ROOTLESS_UIDMAP_START ROOTLESS_UIDMAP_LENGTH
+       export ROOTLESS_GIDMAP_START ROOTLESS_GIDMAP_LENGTH
+
+       # Deactivate sub{uid,gid} mappings.
+       [ -e /etc/subuid ] && mv /etc/subuid{,.tmp}
+       [ -e /etc/subgid ] && mv /etc/subgid{,.tmp}
+
+       # Deactivate new{uid,gid}map helpers. setuid is preserved with mv(1).
+       [ -e /usr/bin/newuidmap ] && mv /usr/bin/{,unused-}newuidmap
+       [ -e /usr/bin/newgidmap ] && mv /usr/bin/{,unused-}newgidmap
+}
+
+# FEATURE: Opportunistic cgroups support, allowing a rootless container to set
+#          resource limits on condition that cgroupsPath is set to a path the
+#          rootless user has permissions on.
+
+# List of cgroups. We handle name= cgroups as well as combined
+# (comma-separated) cgroups and correctly split and/or strip them.
+ALL_CGROUPS=( $(cat /proc/self/cgroup | cut -d: -f2 | sed -E '{s/^name=//;s/,/\n/;/^$/D}') )
+CGROUP_MOUNT="/sys/fs/cgroup"
+CGROUP_PATH="/runc-cgroups-integration-test"
+
+function enable_cgroup() {
+       # Set up cgroups for use in rootless containers.
+       for cg in "${ALL_CGROUPS[@]}"
+       do
+               mkdir -p "$CGROUP_MOUNT/$cg$CGROUP_PATH"
+               # We only need to allow write access to {cgroup.procs,tasks} and the
+               # directory. Rather than changing the owner entirely, we just change
+               # the group and then allow write access to the group (in order to
+               # further limit the possible DAC permissions that runc could use).
+               chown root:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks}
+               chmod g+rwx "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks}
+               # Due to cpuset's semantics we need to give extra permissions to allow
+               # for runc to set up the hierarchy. XXX: This really shouldn't be
+               # necessary, and might actually be a bug in our impl of cgroup
+               # handling.
+               [[ "$cg" == "cpuset" ]] && chown rootless:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/cpuset."{cpus,mems}
+       done
+}
+
+function disable_cgroup() {
+       # Remove cgroups used in rootless containers.
+       for cg in "${ALL_CGROUPS[@]}"
+       do
+               [ -d "$CGROUP_MOUNT/$cg$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$cg$CGROUP_PATH"
+       done
+}
+
+# Create a powerset of $ALL_FEATURES (the set of all subsets of $ALL_FEATURES).
+# We test all of the possible combinations (as long as we don't add too many
+# feature knobs this shouldn't take too long -- but the number of tested
+# combinations is O(2^n)).
+function powerset() {
+       eval printf '%s' $(printf '{,%s+}' "$@"):
+}
+features_powerset="$(powerset "${ALL_FEATURES[@]}")"
+
+# Iterate over the powerset of all features.
+IFS=:
+for enabled_features in $features_powerset
+do
+       idx="$(($idx+1))"
+       echo "[$(printf '%.2d' "$idx")] run rootless tests ... (${enabled_features%%+})"
+
+       unset IFS
+       for feature in "${ALL_FEATURES[@]}"
+       do
+               hook_func="disable_$feature"
+               grep -E "(^|\+)$feature(\+|$)" <<<$enabled_features &>/dev/null && hook_func="enable_$feature"
+               "$hook_func"
+       done
+
+       # Run the test suite!
+       set -e
+       echo path: $PATH
+       export ROOTLESS_FEATURES="$enabled_features"
+       sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$TESTFLAGS"
+       set +e
+done
diff --git a/tty.go b/tty.go
new file mode 100644 (file)
index 0000000..6106c2d
--- /dev/null
+++ b/tty.go
@@ -0,0 +1,170 @@
+// +build linux
+
+package main
+
+import (
+       "fmt"
+       "io"
+       "os"
+       "os/signal"
+       "sync"
+
+       "github.com/containerd/console"
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// tty bundles the console/stdio plumbing for a container process: the
+// epoll-driven container console (when a terminal was requested), the
+// host stdin placed in raw mode, and the file descriptors that must be
+// closed at the appropriate points of the container lifecycle.
+type tty struct {
+       epoller   *console.Epoller      // epoll loop driving the container console
+       console   *console.EpollConsole // container-side console; nil when no terminal is used
+       stdin     console.Console       // host stdin in raw mode; restored in Close
+       closers   []io.Closer           // fds closed in Close, after all IO has drained
+       postStart []io.Closer           // parent-side fds closed right after container start (see ClosePostStart)
+       wg        sync.WaitGroup        // tracks the stdout/stderr copy goroutines
+       consoleC  chan error            // console setup result, consumed by waitConsole (set elsewhere — not in this file chunk)
+}
+
+// copyIO streams r into w until EOF or error, closes r, and signals
+// completion on the tty's WaitGroup (one Done per copy goroutine).
+func (t *tty) copyIO(w io.Writer, r io.ReadCloser) {
+       defer t.wg.Done()
+       io.Copy(w, r)
+       r.Close()
+}
+
+// setupProcessPipes sets up pipes for the process so that advanced features
+// like c/r (checkpoint/restore) are able to easily checkpoint and restore the
+// process's IO without depending on a host specific path or device. The pipe
+// fds are created with the given in-container root uid/gid as owners, and the
+// parent-side ends are wired to this process's stdio via copy goroutines.
+func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, error) {
+       i, err := p.InitializeIO(rootuid, rootgid)
+       if err != nil {
+               return nil, err
+       }
+       // The parent-side pipe ends are closed only once all IO has drained.
+       t := &tty{
+               closers: []io.Closer{
+                       i.Stdin,
+                       i.Stdout,
+                       i.Stderr,
+               },
+       }
+       // add the process's io to the post start closers if they support close
+       for _, cc := range []interface{}{
+               p.Stdin,
+               p.Stdout,
+               p.Stderr,
+       } {
+               if c, ok := cc.(io.Closer); ok {
+                       t.postStart = append(t.postStart, c)
+               }
+       }
+       // Forward our stdin into the container. This goroutine is deliberately
+       // not tracked by the WaitGroup: os.Stdin may never reach EOF, so Close
+       // must not block waiting on it.
+       go func() {
+               io.Copy(i.Stdin, os.Stdin)
+               i.Stdin.Close()
+       }()
+       // Add(2) matches the two copyIO goroutines below (stdout and stderr).
+       t.wg.Add(2)
+       go t.copyIO(os.Stdout, i.Stdout)
+       go t.copyIO(os.Stderr, i.Stderr)
+       return t, nil
+}
+
+// inheritStdio passes this process's own stdio straight through to the
+// container process instead of creating pipes or a console.
+func inheritStdio(process *libcontainer.Process) error {
+       process.Stdin = os.Stdin
+       process.Stdout = os.Stdout
+       process.Stderr = os.Stderr
+       return nil
+}
+
+// recvtty receives the master end of the container's pty over the given unix
+// socket, hooks it up to an epoll-based console, and puts the host stdin into
+// raw mode so keystrokes are relayed verbatim. The named return Err is used
+// by the deferred cleanup to close the epoll console on any late failure.
+func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error) {
+       // The console master fd is passed from the container via SCM_RIGHTS.
+       f, err := utils.RecvFd(socket)
+       if err != nil {
+               return err
+       }
+       cons, err := console.ConsoleFromFile(f)
+       if err != nil {
+               return err
+       }
+       // Disable ONLCR so output is not subject to newline translation.
+       console.ClearONLCR(cons.Fd())
+       epoller, err := console.NewEpoller()
+       if err != nil {
+               return err
+       }
+       epollConsole, err := epoller.Add(cons)
+       if err != nil {
+               return err
+       }
+       defer func() {
+               if Err != nil {
+                       epollConsole.Close()
+               }
+       }()
+       go epoller.Wait()
+       // Relay host stdin to the container console; this copy is not tracked
+       // by the WaitGroup since stdin may never reach EOF.
+       go io.Copy(epollConsole, os.Stdin)
+       t.wg.Add(1)
+       go t.copyIO(os.Stdout, epollConsole)
+
+       // set raw mode to stdin and also handle interrupt
+       stdin, err := console.ConsoleFromFile(os.Stdin)
+       if err != nil {
+               return err
+       }
+       if err := stdin.SetRaw(); err != nil {
+               return fmt.Errorf("failed to set the terminal from the stdin: %v", err)
+       }
+       go handleInterrupt(stdin)
+
+       t.epoller = epoller
+       t.stdin = stdin
+       t.console = epollConsole
+       t.closers = []io.Closer{epollConsole}
+       return nil
+}
+
+// handleInterrupt blocks until SIGINT is received, then restores the
+// terminal's saved state (undoing raw mode) before exiting, so the user's
+// shell is not left in a broken state after Ctrl-C.
+func handleInterrupt(c console.Console) {
+       sigchan := make(chan os.Signal, 1)
+       signal.Notify(sigchan, os.Interrupt)
+       <-sigchan
+       c.Reset()
+       os.Exit(0)
+}
+
+// waitConsole blocks until console setup has reported its result on
+// consoleC, returning that error. It is a no-op when no console channel
+// was set up (i.e. the process has no terminal).
+func (t *tty) waitConsole() error {
+       if t.consoleC != nil {
+               return <-t.consoleC
+       }
+       return nil
+}
+
+// ClosePostStart closes any fds that are provided to the container and dup2'd
+// so that we no longer have a copy in our process.
+func (t *tty) ClosePostStart() error {
+       for _, c := range t.postStart {
+               c.Close()
+       }
+       return nil
+}
+
+// Close closes all open fds for the tty and/or restores the original
+// stdin state to what it was prior to the container execution.
+// It is safe to call on a partially-initialized tty: every step checks
+// for nil before acting.
+func (t *tty) Close() error {
+       // ensure that our side of the fds are always closed
+       for _, c := range t.postStart {
+               c.Close()
+       }
+       // the process is gone at this point, shutting down the console if we have
+       // one and wait for all IO to be finished
+       if t.console != nil && t.epoller != nil {
+               t.console.Shutdown(t.epoller.CloseConsole)
+       }
+       // Wait for the stdout/stderr copy goroutines to drain before closing
+       // their fds, otherwise output could be truncated.
+       t.wg.Wait()
+       for _, c := range t.closers {
+               c.Close()
+       }
+       // Take stdin out of raw mode, restoring the user's terminal settings.
+       if t.stdin != nil {
+               t.stdin.Reset()
+       }
+       return nil
+}
+
+// resize propagates the current size of this process's terminal to the
+// container console; it is a no-op when the process has no console.
+func (t *tty) resize() error {
+       if t.console == nil {
+               return nil
+       }
+       return t.console.ResizeFrom(console.Current())
+}
diff --git a/types/events.go b/types/events.go
new file mode 100644 (file)
index 0000000..c6f0e97
--- /dev/null
@@ -0,0 +1,130 @@
+package types
+
+// Event struct for encoding the event data to json.
+type Event struct {
+       Type string      `json:"type"`
+       ID   string      `json:"id"`
+       Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc-specific stats structure for stability when encoding and
+// decoding stats.
+type Stats struct {
+       CPU               Cpu                 `json:"cpu"`
+       Memory            Memory              `json:"memory"`
+       Pids              Pids                `json:"pids"`
+       Blkio             Blkio               `json:"blkio"`
+       Hugetlb           map[string]Hugetlb  `json:"hugetlb"`
+       IntelRdt          IntelRdt            `json:"intel_rdt"`
+       NetworkInterfaces []*NetworkInterface `json:"network_interfaces"`
+}
+
+type Hugetlb struct {
+       Usage   uint64 `json:"usage,omitempty"`
+       Max     uint64 `json:"max,omitempty"`
+       Failcnt uint64 `json:"failcnt"`
+}
+
+type BlkioEntry struct {
+       Major uint64 `json:"major,omitempty"`
+       Minor uint64 `json:"minor,omitempty"`
+       Op    string `json:"op,omitempty"`
+       Value uint64 `json:"value,omitempty"`
+}
+
+type Blkio struct {
+       IoServiceBytesRecursive []BlkioEntry `json:"ioServiceBytesRecursive,omitempty"`
+       IoServicedRecursive     []BlkioEntry `json:"ioServicedRecursive,omitempty"`
+       IoQueuedRecursive       []BlkioEntry `json:"ioQueueRecursive,omitempty"`
+       IoServiceTimeRecursive  []BlkioEntry `json:"ioServiceTimeRecursive,omitempty"`
+       IoWaitTimeRecursive     []BlkioEntry `json:"ioWaitTimeRecursive,omitempty"`
+       IoMergedRecursive       []BlkioEntry `json:"ioMergedRecursive,omitempty"`
+       IoTimeRecursive         []BlkioEntry `json:"ioTimeRecursive,omitempty"`
+       SectorsRecursive        []BlkioEntry `json:"sectorsRecursive,omitempty"`
+}
+
+type Pids struct {
+       Current uint64 `json:"current,omitempty"`
+       Limit   uint64 `json:"limit,omitempty"`
+}
+
+type Throttling struct {
+       Periods          uint64 `json:"periods,omitempty"`
+       ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"`
+       ThrottledTime    uint64 `json:"throttledTime,omitempty"`
+}
+
+type CpuUsage struct {
+       // Units: nanoseconds.
+       Total  uint64   `json:"total,omitempty"`
+       Percpu []uint64 `json:"percpu,omitempty"`
+       Kernel uint64   `json:"kernel"`
+       User   uint64   `json:"user"`
+}
+
+type Cpu struct {
+       Usage      CpuUsage   `json:"usage,omitempty"`
+       Throttling Throttling `json:"throttling,omitempty"`
+}
+
+type MemoryEntry struct {
+       Limit   uint64 `json:"limit"`
+       Usage   uint64 `json:"usage,omitempty"`
+       Max     uint64 `json:"max,omitempty"`
+       Failcnt uint64 `json:"failcnt"`
+}
+
+type Memory struct {
+       Cache     uint64            `json:"cache,omitempty"`
+       Usage     MemoryEntry       `json:"usage,omitempty"`
+       Swap      MemoryEntry       `json:"swap,omitempty"`
+       Kernel    MemoryEntry       `json:"kernel,omitempty"`
+       KernelTCP MemoryEntry       `json:"kernelTCP,omitempty"`
+       Raw       map[string]uint64 `json:"raw,omitempty"`
+}
+
+type L3CacheInfo struct {
+       CbmMask    string `json:"cbm_mask,omitempty"`
+       MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
+       NumClosids uint64 `json:"num_closids,omitempty"`
+}
+
+type MemBwInfo struct {
+       BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
+       DelayLinear   uint64 `json:"delay_linear,omitempty"`
+       MinBandwidth  uint64 `json:"min_bandwidth,omitempty"`
+       NumClosids    uint64 `json:"num_closids,omitempty"`
+}
+
+type IntelRdt struct {
+       // The read-only L3 cache information
+       L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
+
+       // The read-only L3 cache schema in root
+       L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
+
+       // The L3 cache schema in 'container_id' group
+       L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+
+       // The read-only memory bandwidth information
+       MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
+
+       // The read-only memory bandwidth schema in root
+       MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
+
+       // The memory bandwidth schema in 'container_id' group
+       MemBwSchema string `json:"mem_bw_schema,omitempty"`
+}
+
+type NetworkInterface struct {
+       // Name is the name of the network interface.
+       Name string
+
+       RxBytes   uint64
+       RxPackets uint64
+       RxErrors  uint64
+       RxDropped uint64
+       TxBytes   uint64
+       TxPackets uint64
+       TxErrors  uint64
+       TxDropped uint64
+}
diff --git a/update.go b/update.go
new file mode 100644 (file)
index 0000000..05dc4b5
--- /dev/null
+++ b/update.go
@@ -0,0 +1,304 @@
+// +build linux
+
+package main
+
+import (
+       "encoding/json"
+       "fmt"
+       "os"
+       "strconv"
+
+       "github.com/docker/go-units"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runtime-spec/specs-go"
+       "github.com/urfave/cli"
+)
+
// i64Ptr returns a pointer to a fresh copy of the given int64.
func i64Ptr(i int64) *int64 {
	v := i
	return &v
}

// u64Ptr returns a pointer to a fresh copy of the given uint64.
func u64Ptr(i uint64) *uint64 {
	v := i
	return &v
}

// u16Ptr returns a pointer to a fresh copy of the given uint16.
func u16Ptr(i uint16) *uint16 {
	v := i
	return &v
}
+
+var updateCommand = cli.Command{
+       Name:      "update",
+       Usage:     "update container resource constraints",
+       ArgsUsage: `<container-id>`,
+       Flags: []cli.Flag{
+               cli.StringFlag{
+                       Name:  "resources, r",
+                       Value: "",
+                       Usage: `path to the file containing the resources to update or '-' to read from the standard input
+
+The accepted format is as follow (unchanged values can be omitted):
+
+{
+  "memory": {
+    "limit": 0,
+    "reservation": 0,
+    "swap": 0,
+    "kernel": 0,
+    "kernelTCP": 0
+  },
+  "cpu": {
+    "shares": 0,
+    "quota": 0,
+    "period": 0,
+    "realtimeRuntime": 0,
+    "realtimePeriod": 0,
+    "cpus": "",
+    "mems": ""
+  },
+  "blockIO": {
+    "weight": 0
+  }
+}
+
+Note: if data is to be read from a file or the standard input, all
+other options are ignored.
+`,
+               },
+
+               cli.IntFlag{
+                       Name:  "blkio-weight",
+                       Usage: "Specifies per cgroup weight, range is from 10 to 1000",
+               },
+               cli.StringFlag{
+                       Name:  "cpu-period",
+                       Usage: "CPU CFS period to be used for hardcapping (in usecs). 0 to use system default",
+               },
+               cli.StringFlag{
+                       Name:  "cpu-quota",
+                       Usage: "CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period",
+               },
+               cli.StringFlag{
+                       Name:  "cpu-share",
+                       Usage: "CPU shares (relative weight vs. other containers)",
+               },
+               cli.StringFlag{
+                       Name:  "cpu-rt-period",
+                       Usage: "CPU realtime period to be used for hardcapping (in usecs). 0 to use system default",
+               },
+               cli.StringFlag{
+                       Name:  "cpu-rt-runtime",
+                       Usage: "CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period",
+               },
+               cli.StringFlag{
+                       Name:  "cpuset-cpus",
+                       Usage: "CPU(s) to use",
+               },
+               cli.StringFlag{
+                       Name:  "cpuset-mems",
+                       Usage: "Memory node(s) to use",
+               },
+               cli.StringFlag{
+                       Name:  "kernel-memory",
+                       Usage: "Kernel memory limit (in bytes)",
+               },
+               cli.StringFlag{
+                       Name:  "kernel-memory-tcp",
+                       Usage: "Kernel memory limit (in bytes) for tcp buffer",
+               },
+               cli.StringFlag{
+                       Name:  "memory",
+                       Usage: "Memory limit (in bytes)",
+               },
+               cli.StringFlag{
+                       Name:  "memory-reservation",
+                       Usage: "Memory reservation or soft_limit (in bytes)",
+               },
+               cli.StringFlag{
+                       Name:  "memory-swap",
+                       Usage: "Total memory usage (memory + swap); set '-1' to enable unlimited swap",
+               },
+               cli.IntFlag{
+                       Name:  "pids-limit",
+                       Usage: "Maximum number of pids allowed in the container",
+               },
+               cli.StringFlag{
+                       Name:  "l3-cache-schema",
+                       Usage: "The string of Intel RDT/CAT L3 cache schema",
+               },
+               cli.StringFlag{
+                       Name:  "mem-bw-schema",
+                       Usage: "The string of Intel RDT/MBA memory bandwidth schema",
+               },
+       },
+       Action: func(context *cli.Context) error {
+               if err := checkArgs(context, 1, exactArgs); err != nil {
+                       return err
+               }
+               container, err := getContainer(context)
+               if err != nil {
+                       return err
+               }
+
+               r := specs.LinuxResources{
+                       Memory: &specs.LinuxMemory{
+                               Limit:       i64Ptr(0),
+                               Reservation: i64Ptr(0),
+                               Swap:        i64Ptr(0),
+                               Kernel:      i64Ptr(0),
+                               KernelTCP:   i64Ptr(0),
+                       },
+                       CPU: &specs.LinuxCPU{
+                               Shares:          u64Ptr(0),
+                               Quota:           i64Ptr(0),
+                               Period:          u64Ptr(0),
+                               RealtimeRuntime: i64Ptr(0),
+                               RealtimePeriod:  u64Ptr(0),
+                               Cpus:            "",
+                               Mems:            "",
+                       },
+                       BlockIO: &specs.LinuxBlockIO{
+                               Weight: u16Ptr(0),
+                       },
+                       Pids: &specs.LinuxPids{
+                               Limit: 0,
+                       },
+               }
+
+               config := container.Config()
+
+               if in := context.String("resources"); in != "" {
+                       var (
+                               f   *os.File
+                               err error
+                       )
+                       switch in {
+                       case "-":
+                               f = os.Stdin
+                       default:
+                               f, err = os.Open(in)
+                               if err != nil {
+                                       return err
+                               }
+                       }
+                       err = json.NewDecoder(f).Decode(&r)
+                       if err != nil {
+                               return err
+                       }
+               } else {
+                       if val := context.Int("blkio-weight"); val != 0 {
+                               r.BlockIO.Weight = u16Ptr(uint16(val))
+                       }
+                       if val := context.String("cpuset-cpus"); val != "" {
+                               r.CPU.Cpus = val
+                       }
+                       if val := context.String("cpuset-mems"); val != "" {
+                               r.CPU.Mems = val
+                       }
+
+                       for _, pair := range []struct {
+                               opt  string
+                               dest *uint64
+                       }{
+
+                               {"cpu-period", r.CPU.Period},
+                               {"cpu-rt-period", r.CPU.RealtimePeriod},
+                               {"cpu-share", r.CPU.Shares},
+                       } {
+                               if val := context.String(pair.opt); val != "" {
+                                       var err error
+                                       *pair.dest, err = strconv.ParseUint(val, 10, 64)
+                                       if err != nil {
+                                               return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+                                       }
+                               }
+                       }
+                       for _, pair := range []struct {
+                               opt  string
+                               dest *int64
+                       }{
+
+                               {"cpu-quota", r.CPU.Quota},
+                               {"cpu-rt-runtime", r.CPU.RealtimeRuntime},
+                       } {
+                               if val := context.String(pair.opt); val != "" {
+                                       var err error
+                                       *pair.dest, err = strconv.ParseInt(val, 10, 64)
+                                       if err != nil {
+                                               return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+                                       }
+                               }
+                       }
+                       for _, pair := range []struct {
+                               opt  string
+                               dest *int64
+                       }{
+                               {"memory", r.Memory.Limit},
+                               {"memory-swap", r.Memory.Swap},
+                               {"kernel-memory", r.Memory.Kernel},
+                               {"kernel-memory-tcp", r.Memory.KernelTCP},
+                               {"memory-reservation", r.Memory.Reservation},
+                       } {
+                               if val := context.String(pair.opt); val != "" {
+                                       var v int64
+
+                                       if val != "-1" {
+                                               v, err = units.RAMInBytes(val)
+                                               if err != nil {
+                                                       return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+                                               }
+                                       } else {
+                                               v = -1
+                                       }
+                                       *pair.dest = v
+                               }
+                       }
+                       r.Pids.Limit = int64(context.Int("pids-limit"))
+               }
+
+               // Update the value
+               config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight
+               config.Cgroups.Resources.CpuPeriod = *r.CPU.Period
+               config.Cgroups.Resources.CpuQuota = *r.CPU.Quota
+               config.Cgroups.Resources.CpuShares = *r.CPU.Shares
+               config.Cgroups.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
+               config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
+               config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus
+               config.Cgroups.Resources.CpusetMems = r.CPU.Mems
+               config.Cgroups.Resources.KernelMemory = *r.Memory.Kernel
+               config.Cgroups.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
+               config.Cgroups.Resources.Memory = *r.Memory.Limit
+               config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation
+               config.Cgroups.Resources.MemorySwap = *r.Memory.Swap
+               config.Cgroups.Resources.PidsLimit = r.Pids.Limit
+
+               // Update Intel RDT
+               l3CacheSchema := context.String("l3-cache-schema")
+               memBwSchema := context.String("mem-bw-schema")
+               if l3CacheSchema != "" && !intelrdt.IsCatEnabled() {
+                       return fmt.Errorf("Intel RDT/CAT: l3 cache schema is not enabled")
+               }
+
+               if memBwSchema != "" && !intelrdt.IsMbaEnabled() {
+                       return fmt.Errorf("Intel RDT/MBA: memory bandwidth schema is not enabled")
+               }
+
+               if l3CacheSchema != "" || memBwSchema != "" {
+                       // If intelRdt is not specified in original configuration, we just don't
+                       // Apply() to create intelRdt group or attach tasks for this container.
+                       // In update command, we could re-enable through IntelRdtManager.Apply()
+                       // and then update intelrdt constraint.
+                       if config.IntelRdt == nil {
+                               state, err := container.State()
+                               if err != nil {
+                                       return err
+                               }
+                               config.IntelRdt = &configs.IntelRdt{}
+                               intelRdtManager := intelrdt.IntelRdtManager{
+                                       Config: &config,
+                                       Id:     container.ID(),
+                                       Path:   state.IntelRdtPath,
+                               }
+                               if err := intelRdtManager.Apply(state.InitProcessPid); err != nil {
+                                       return err
+                               }
+                       }
+                       config.IntelRdt.L3CacheSchema = l3CacheSchema
+                       config.IntelRdt.MemBwSchema = memBwSchema
+               }
+
+               return container.Set(config)
+       },
+}
diff --git a/utils.go b/utils.go
new file mode 100644 (file)
index 0000000..5165336
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,94 @@
+package main
+
+import (
+       "fmt"
+       "os"
+       "path/filepath"
+       "strconv"
+       "strings"
+
+       "github.com/opencontainers/runtime-spec/specs-go"
+
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+)
+
// Argument-count check modes understood by checkArgs.
const (
	exactArgs = iota // NArg must equal the expected count
	minArgs          // NArg must be at least the expected count
	maxArgs          // NArg must be at most the expected count
)
+
+func checkArgs(context *cli.Context, expected, checkType int) error {
+       var err error
+       cmdName := context.Command.Name
+       switch checkType {
+       case exactArgs:
+               if context.NArg() != expected {
+                       err = fmt.Errorf("%s: %q requires exactly %d argument(s)", os.Args[0], cmdName, expected)
+               }
+       case minArgs:
+               if context.NArg() < expected {
+                       err = fmt.Errorf("%s: %q requires a minimum of %d argument(s)", os.Args[0], cmdName, expected)
+               }
+       case maxArgs:
+               if context.NArg() > expected {
+                       err = fmt.Errorf("%s: %q requires a maximum of %d argument(s)", os.Args[0], cmdName, expected)
+               }
+       }
+
+       if err != nil {
+               fmt.Printf("Incorrect Usage.\n\n")
+               cli.ShowCommandHelp(context, cmdName)
+               return err
+       }
+       return nil
+}
+
// fatal prints the error's details if it is a libcontainer specific error type
// then exits the program with an exit status of 1. The error is recorded in
// the logger first and echoed to stderr, since os.Exit returns immediately.
func fatal(err error) {
	// make sure the error is written to the logger
	logrus.Error(err)
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}
+
+// setupSpec performs initial setup based on the cli.Context for the container
+func setupSpec(context *cli.Context) (*specs.Spec, error) {
+       bundle := context.String("bundle")
+       if bundle != "" {
+               if err := os.Chdir(bundle); err != nil {
+                       return nil, err
+               }
+       }
+       spec, err := loadSpec(specConfig)
+       if err != nil {
+               return nil, err
+       }
+       return spec, nil
+}
+
+func revisePidFile(context *cli.Context) error {
+       pidFile := context.String("pid-file")
+       if pidFile == "" {
+               return nil
+       }
+
+       // convert pid-file to an absolute path so we can write to the right
+       // file after chdir to bundle
+       pidFile, err := filepath.Abs(pidFile)
+       if err != nil {
+               return err
+       }
+       return context.Set("pid-file", pidFile)
+}
+
// parseBoolOrAuto parses s as a boolean, returning (nil, nil) when s is
// empty or the (case-insensitive) word "auto" to mean "no explicit choice".
func parseBoolOrAuto(s string) (*bool, error) {
	switch strings.ToLower(s) {
	case "", "auto":
		return nil, nil
	}
	b, err := strconv.ParseBool(s)
	return &b, err
}
diff --git a/utils_linux.go b/utils_linux.go
new file mode 100644 (file)
index 0000000..984e6b0
--- /dev/null
@@ -0,0 +1,453 @@
+// +build linux
+
+package main
+
+import (
+       "fmt"
+       "net"
+       "os"
+       "os/exec"
+       "path/filepath"
+       "strconv"
+
+       "github.com/opencontainers/runc/libcontainer"
+       "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+       "github.com/opencontainers/runc/libcontainer/configs"
+       "github.com/opencontainers/runc/libcontainer/intelrdt"
+       "github.com/opencontainers/runc/libcontainer/specconv"
+       "github.com/opencontainers/runc/libcontainer/utils"
+       "github.com/opencontainers/runtime-spec/specs-go"
+       selinux "github.com/opencontainers/selinux/go-selinux"
+
+       "github.com/coreos/go-systemd/activation"
+       "github.com/pkg/errors"
+       "github.com/sirupsen/logrus"
+       "github.com/urfave/cli"
+       "golang.org/x/sys/unix"
+)
+
+var errEmptyID = errors.New("container id cannot be empty")
+
// loadFactory returns the configured factory instance for execing containers.
// It resolves the state root to an absolute path, selects the cgroup manager
// (cgroupfs, rootless cgroupfs, or systemd), enables the Intel RDT manager
// only when CAT or MBA is available, and pre-resolves newuidmap/newgidmap.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
	root := context.GlobalString("root")
	abs, err := filepath.Abs(root)
	if err != nil {
		return nil, err
	}

	// We default to cgroupfs, and can only use systemd if the system is a
	// systemd box.
	cgroupManager := libcontainer.Cgroupfs
	rootlessCg, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}
	if rootlessCg {
		cgroupManager = libcontainer.RootlessCgroupfs
	}
	if context.GlobalBool("systemd-cgroup") {
		if systemd.UseSystemd() {
			cgroupManager = libcontainer.SystemdCgroups
		} else {
			return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
		}
	}

	// Skip the Intel RDT manager entirely when neither CAT nor MBA is
	// enabled on this system.
	intelRdtManager := libcontainer.IntelRdtFs
	if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
		intelRdtManager = nil
	}

	// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
	// to avoid doing a path lookup in the nsexec context. TODO: The binary
	// names are not currently configurable.
	newuidmap, err := exec.LookPath("newuidmap")
	if err != nil {
		// Lookup failure is not fatal; an empty path means "not available".
		newuidmap = ""
	}
	newgidmap, err := exec.LookPath("newgidmap")
	if err != nil {
		newgidmap = ""
	}

	return libcontainer.New(abs, cgroupManager, intelRdtManager,
		libcontainer.CriuPath(context.GlobalString("criu")),
		libcontainer.NewuidmapPath(newuidmap),
		libcontainer.NewgidmapPath(newgidmap))
}
+
+// getContainer returns the specified container instance by loading it from state
+// with the default factory.
+func getContainer(context *cli.Context) (libcontainer.Container, error) {
+       id := context.Args().First()
+       if id == "" {
+               return nil, errEmptyID
+       }
+       factory, err := loadFactory(context)
+       if err != nil {
+               return nil, err
+       }
+       return factory.Load(id)
+}
+
// fatalf formats its arguments according to t and terminates the program
// via fatal (exit status 1).
func fatalf(t string, v ...interface{}) {
	fatal(fmt.Errorf(t, v...))
}
+
+func getDefaultImagePath(context *cli.Context) string {
+       cwd, err := os.Getwd()
+       if err != nil {
+               panic(err)
+       }
+       return filepath.Join(cwd, "checkpoint")
+}
+
// newProcess returns a new libcontainer Process with the arguments from the
// spec and stdio from the current process. init marks the process as the
// container's init process; logLevel is forwarded to the container process.
func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) {
	lp := &libcontainer.Process{
		Args: p.Args,
		Env:  p.Env,
		// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
		User:            fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
		Cwd:             p.Cwd,
		Label:           p.SelinuxLabel,
		NoNewPrivileges: &p.NoNewPrivileges,
		AppArmorProfile: p.ApparmorProfile,
		Init:            init,
		LogLevel:        logLevel,
	}

	// Propagate an explicitly requested console geometry, if any.
	if p.ConsoleSize != nil {
		lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
		lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
	}

	// Copy each capability set individually; when the spec carries no
	// capabilities, lp.Capabilities stays nil.
	if p.Capabilities != nil {
		lp.Capabilities = &configs.Capabilities{}
		lp.Capabilities.Bounding = p.Capabilities.Bounding
		lp.Capabilities.Effective = p.Capabilities.Effective
		lp.Capabilities.Inheritable = p.Capabilities.Inheritable
		lp.Capabilities.Permitted = p.Capabilities.Permitted
		lp.Capabilities.Ambient = p.Capabilities.Ambient
	}
	// Supplementary group ids are passed to libcontainer as decimal strings.
	for _, gid := range p.User.AdditionalGids {
		lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
	}
	// Translate each spec rlimit into libcontainer's representation.
	for _, rlimit := range p.Rlimits {
		rl, err := createLibContainerRlimit(rlimit)
		if err != nil {
			return nil, err
		}
		lp.Rlimits = append(lp.Rlimits, rl)
	}
	return lp, nil
}
+
// destroy tears the container down, logging (but not propagating) any error
// so it is safe to call from cleanup paths.
func destroy(container libcontainer.Container) {
	if err := container.Destroy(); err != nil {
		logrus.Error(err)
	}
}
+
// setupIO modifies the given process config according to the options.
// With a tty: stdio is detached from the current process and the console
// master is delivered either over an in-process socket pair (foreground)
// or over the caller-provided console socket at sockpath (detached).
// Without a tty: stdio is inherited when detaching, or piped otherwise.
func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
	if createTTY {
		process.Stdin = nil
		process.Stdout = nil
		process.Stderr = nil
		t := &tty{}
		if !detach {
			// Foreground: receive the console over a socket pair we own.
			parent, child, err := utils.NewSockPair("console")
			if err != nil {
				return nil, err
			}
			process.ConsoleSocket = child
			t.postStart = append(t.postStart, parent, child)
			// consoleC is buffered so the goroutine can report the first
			// result without blocking.
			t.consoleC = make(chan error, 1)
			go func() {
				if err := t.recvtty(process, parent); err != nil {
					t.consoleC <- err
				}
				t.consoleC <- nil
			}()
		} else {
			// the caller of runc will handle receiving the console master
			conn, err := net.Dial("unix", sockpath)
			if err != nil {
				return nil, err
			}
			uc, ok := conn.(*net.UnixConn)
			if !ok {
				return nil, fmt.Errorf("casting to UnixConn failed")
			}
			// Both the connection and its dup'd *os.File are closed after
			// the container starts (postStart).
			t.postStart = append(t.postStart, uc)
			socket, err := uc.File()
			if err != nil {
				return nil, err
			}
			t.postStart = append(t.postStart, socket)
			process.ConsoleSocket = socket
		}
		return t, nil
	}
	// when runc will detach the caller provides the stdio to runc via runc's 0,1,2
	// and the container's process inherits runc's stdio.
	if detach {
		if err := inheritStdio(process); err != nil {
			return nil, err
		}
		return &tty{}, nil
	}
	return setupProcessPipes(process, rootuid, rootgid)
}
+
+// createPidFile creates a file with the processes pid inside it atomically
+// it creates a temp file with the paths filename + '.' infront of it
+// then renames the file
+func createPidFile(path string, process *libcontainer.Process) error {
+       pid, err := process.Pid()
+       if err != nil {
+               return err
+       }
+       var (
+               tmpDir  = filepath.Dir(path)
+               tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path)))
+       )
+       f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666)
+       if err != nil {
+               return err
+       }
+       _, err = fmt.Fprintf(f, "%d", pid)
+       f.Close()
+       if err != nil {
+               return err
+       }
+       return os.Rename(tmpName, path)
+}
+
// createContainer translates the OCI spec into a libcontainer config and
// creates (but does not start) the container under the configured factory.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	rootlessCg, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}
	config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
		// Rootless mode is inferred from the effective uid of this process.
		RootlessEUID:    os.Geteuid() != 0,
		RootlessCgroups: rootlessCg,
	})
	if err != nil {
		return nil, err
	}

	factory, err := loadFactory(context)
	if err != nil {
		return nil, err
	}
	return factory.Create(id, config)
}
+
// runner bundles everything needed to perform one container lifecycle
// action (create/run/restore) and wire up the process's stdio.
type runner struct {
	init            bool                   // forwarded to newProcess as Process.Init
	enableSubreaper bool                   // handed to newSignalHandler
	shouldDestroy   bool                   // destroy() is a no-op unless set
	detach          bool                   // do not wait for the process to exit
	listenFDs       []*os.File             // socket-activation fds passed into the container
	preserveFDs     int                    // number of caller fds (after stdio/ExtraFiles) to preserve
	pidFile         string                 // if non-empty, run() writes the pid here
	consoleSocket   string                 // unix socket path for console delivery when detached
	container       libcontainer.Container // the container being acted on
	action          CtAct                  // which lifecycle action run() performs
	notifySocket    *notifySocket          // NOTIFY_SOCKET proxy; may be nil
	criuOpts        *libcontainer.CriuOpts // used only for CT_ACT_RESTORE
	logLevel        string                 // log level forwarded to the container process
}
+
// run executes the runner's configured action for the given process spec:
// it builds the libcontainer process, wires stdio/tty, performs the
// create/run/restore, writes the pid file, and — unless detaching —
// forwards signals and returns the process's exit status. On error the
// container is destroyed (when shouldDestroy is set).
func (r *runner) run(config *specs.Process) (int, error) {
	var err error
	// Tear the container down on any error exit below.
	defer func() {
		if err != nil {
			r.destroy()
		}
	}()
	if err = r.checkTerminal(config); err != nil {
		return -1, err
	}
	process, err := newProcess(*config, r.init, r.logLevel)
	if err != nil {
		return -1, err
	}
	// Socket activation: advertise the fds being passed down. LISTEN_PID=1
	// presumably names the container init in its own pid namespace — the
	// value is fixed here.
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}
	// preserve-fds: the fds immediately after stdio and ExtraFiles must
	// exist in our fd table before they can be handed to the container.
	baseFd := 3 + len(process.ExtraFiles)
	for i := baseFd; i < baseFd+r.preserveFDs; i++ {
		_, err = os.Stat(fmt.Sprintf("/proc/self/fd/%d", i))
		if err != nil {
			return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs)
		}
		process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
	}
	rootuid, err := r.container.Config().HostRootUID()
	if err != nil {
		return -1, err
	}
	rootgid, err := r.container.Config().HostRootGID()
	if err != nil {
		return -1, err
	}
	var (
		detach = r.detach || (r.action == CT_ACT_CREATE)
	)
	// Setting up IO is a two stage process. We need to modify process to deal
	// with detaching containers, and then we get a tty after the container has
	// started.
	handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
	if err != nil {
		return -1, err
	}
	defer tty.Close()

	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
	if err != nil {
		return -1, err
	}
	// Wait for the console to be set up, then release the post-start fds
	// and record the pid; a failure in any of these kills the process.
	if err = tty.waitConsole(); err != nil {
		r.terminate(process)
		return -1, err
	}
	if err = tty.ClosePostStart(); err != nil {
		r.terminate(process)
		return -1, err
	}
	if r.pidFile != "" {
		if err = createPidFile(r.pidFile, process); err != nil {
			r.terminate(process)
			return -1, err
		}
	}
	status, err := handler.forward(process, tty, detach)
	if err != nil {
		r.terminate(process)
	}
	if detach {
		return 0, nil
	}
	r.destroy()
	return status, err
}
+
+func (r *runner) destroy() {
+       if r.shouldDestroy {
+               destroy(r.container)
+       }
+}
+
// terminate force-kills the process and reaps it. Errors are deliberately
// ignored: the process may already have exited.
func (r *runner) terminate(p *libcontainer.Process) {
	_ = p.Signal(unix.SIGKILL)
	_, _ = p.Wait()
}
+
+func (r *runner) checkTerminal(config *specs.Process) error {
+       detach := r.detach || (r.action == CT_ACT_CREATE)
+       // Check command-line for sanity.
+       if detach && config.Terminal && r.consoleSocket == "" {
+               return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
+       }
+       if (!detach || !config.Terminal) && r.consoleSocket != "" {
+               return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
+       }
+       return nil
+}
+
+func validateProcessSpec(spec *specs.Process) error {
+       if spec.Cwd == "" {
+               return fmt.Errorf("Cwd property must not be empty")
+       }
+       if !filepath.IsAbs(spec.Cwd) {
+               return fmt.Errorf("Cwd must be an absolute path")
+       }
+       if len(spec.Args) == 0 {
+               return fmt.Errorf("args must not be empty")
+       }
+       if spec.SelinuxLabel != "" && !selinux.GetEnabled() {
+               return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
+       }
+       return nil
+}
+
// CtAct selects which container lifecycle action startContainer/runner.run
// performs.
type CtAct uint8

const (
	CT_ACT_CREATE  CtAct = iota + 1 // create: container.Start without waiting
	CT_ACT_RUN                      // run: container.Run
	CT_ACT_RESTORE                  // restore: container.Restore with CRIU options
)
+
// startContainer is the shared entry point for the create/run/restore
// commands: it creates the container from the spec, sets up optional
// NOTIFY_SOCKET proxying and socket activation, then delegates the action
// to runner.run. It returns the process exit status (0 when detaching).
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	}

	// Optional systemd notify support; nil when NOTIFY_SOCKET is unset.
	notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
	if notifySocket != nil {
		notifySocket.setupSpec(context, spec)
	}

	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}

	if notifySocket != nil {
		err := notifySocket.setupSocket()
		if err != nil {
			return -1, err
		}
	}

	// Support on-demand socket activation by passing file descriptors into the container init process.
	listenFDs := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		listenFDs = activation.Files(false)
	}

	logLevel := "info"
	if context.GlobalBool("debug") {
		logLevel = "debug"
	}

	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
		logLevel:        logLevel,
	}
	return r.run(spec.Process)
}
diff --git a/vendor.conf b/vendor.conf
new file mode 100644 (file)
index 0000000..dd51785
--- /dev/null
@@ -0,0 +1,31 @@
+# OCI runtime-spec. When updating this, make sure you use a version tag rather
+# than a commit ID so it's much more obvious what version of the spec we are
+# using.
+github.com/opencontainers/runtime-spec  29686dbc5559d93fb1ef402eeda3e35c38d75af4 # v1.0.1-59-g29686db
+
+# Core libcontainer functionality.
+github.com/checkpoint-restore/go-criu   17b0214f6c48980c45dc47ecb0cfd6d9e02df723 # v3.11
+github.com/mrunalp/fileutils            7d4729fb36185a7c1719923406c9d40e54fb93c7
+github.com/opencontainers/selinux       5215b1806f52b1fcc2070a8826c542c9d33cd3cf # v1.3.0 (+ CVE-2019-16884)
+github.com/seccomp/libseccomp-golang    689e3c1541a84461afc49c1c87352a6cedf72e9c # v0.9.1
+github.com/sirupsen/logrus              8bdbc7bcc01dcbb8ec23dc8a28e332258d25251f # v1.4.1
+github.com/syndtr/gocapability          d98352740cb2c55f81556b63d4a1ec64c5a319c2
+github.com/vishvananda/netlink          1e2e08e8a2dcdacaae3f14ac44c5cfa31361f270
+
+# systemd integration.
+github.com/coreos/go-systemd            95778dfbb74eb7e4dbaf43bf7d71809650ef8076 # v19
+github.com/godbus/dbus                  2ff6f7ffd60f0f2410b3105864bdd12c7894f844 # v5.0.1
+github.com/golang/protobuf              925541529c1fa6821df4e44ce2723319eb2be768 # v1.0.0
+
+# Command-line interface.
+github.com/cyphar/filepath-securejoin   a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2
+github.com/docker/go-units              47565b4f722fb6ceae66b95f853feed578a4a51c # v0.3.3
+github.com/urfave/cli                   cfb38830724cc34fedffe9a2a29fb54fa9169cd1 # v1.20.0
+golang.org/x/sys                        9eafafc0a87e0fd0aeeba439a4573537970c44c7 https://github.com/golang/sys
+
+# console dependencies
+github.com/containerd/console           0650fd9eeb50bab4fc99dceb9f2e14cf58f36e7f
+github.com/pkg/errors                   ba968bfe8b2f7e042a574c888954fccecfa385b4 # v0.8.1
+
+# ebpf dependencies
+github.com/cilium/ebpf                  95b36a581eed7b0f127306ed1d16cc0ddc06cf67
diff --git a/vendor/github.com/cilium/ebpf/LICENSE b/vendor/github.com/cilium/ebpf/LICENSE
new file mode 100644 (file)
index 0000000..c637ae9
--- /dev/null
@@ -0,0 +1,23 @@
+MIT License
+
+Copyright (c) 2017 Nathan Sweet
+Copyright (c) 2018, 2019 Cloudflare
+Copyright (c) 2019 Authors of Cilium
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/vendor/github.com/cilium/ebpf/abi.go b/vendor/github.com/cilium/ebpf/abi.go
new file mode 100644 (file)
index 0000000..999b8cc
--- /dev/null
@@ -0,0 +1,183 @@
+package ebpf
+
+import (
+       "github.com/pkg/errors"
+)
+
+// CollectionABI describes the interface of an eBPF collection.
+type CollectionABI struct {
+       Maps     map[string]*MapABI
+       Programs map[string]*ProgramABI
+}
+
+// CheckSpec verifies that all maps and programs mentioned
+// in the ABI are present in the spec.
+func (abi *CollectionABI) CheckSpec(cs *CollectionSpec) error {
+       for name := range abi.Maps {
+               if cs.Maps[name] == nil {
+                       return errors.Errorf("missing map %s", name)
+               }
+       }
+
+       for name := range abi.Programs {
+               if cs.Programs[name] == nil {
+                       return errors.Errorf("missing program %s", name)
+               }
+       }
+
+       return nil
+}
+
+// Check verifies that all items in a collection conform to this ABI.
+func (abi *CollectionABI) Check(coll *Collection) error {
+       for name, mapABI := range abi.Maps {
+               m := coll.Maps[name]
+               if m == nil {
+                       return errors.Errorf("missing map %s", name)
+               }
+               if err := mapABI.Check(m); err != nil {
+                       return errors.Wrapf(err, "map %s", name)
+               }
+       }
+
+       for name, progABI := range abi.Programs {
+               p := coll.Programs[name]
+               if p == nil {
+                       return errors.Errorf("missing program %s", name)
+               }
+               if err := progABI.Check(p); err != nil {
+                       return errors.Wrapf(err, "program %s", name)
+               }
+       }
+
+       return nil
+}
+
+// MapABI describes a Map.
+//
+// Use it to assert that a Map matches what your code expects.
+type MapABI struct {
+       Type       MapType
+       KeySize    uint32
+       ValueSize  uint32
+       MaxEntries uint32
+       InnerMap   *MapABI
+}
+
+func newMapABIFromSpec(spec *MapSpec) *MapABI {
+       var inner *MapABI
+       if spec.InnerMap != nil {
+               inner = newMapABIFromSpec(spec.InnerMap)
+       }
+
+       return &MapABI{
+               spec.Type,
+               spec.KeySize,
+               spec.ValueSize,
+               spec.MaxEntries,
+               inner,
+       }
+}
+
+func newMapABIFromFd(fd *bpfFD) (*MapABI, error) {
+       info, err := bpfGetMapInfoByFD(fd)
+       if err != nil {
+               return nil, err
+       }
+
+       mapType := MapType(info.mapType)
+       if mapType == ArrayOfMaps || mapType == HashOfMaps {
+               return nil, errors.New("can't get map info for nested maps")
+       }
+
+       return &MapABI{
+               mapType,
+               info.keySize,
+               info.valueSize,
+               info.maxEntries,
+               nil,
+       }, nil
+}
+
+// Check verifies that a Map conforms to the ABI.
+//
+// Members of ABI which have the zero value of their type are not checked.
+func (abi *MapABI) Check(m *Map) error {
+       return abi.check(&m.abi)
+}
+
+func (abi *MapABI) check(other *MapABI) error {
+       if abi.Type != UnspecifiedMap && other.Type != abi.Type {
+               return errors.Errorf("expected map type %s, have %s", abi.Type, other.Type)
+       }
+       if err := checkUint32("key size", abi.KeySize, other.KeySize); err != nil {
+               return err
+       }
+       if err := checkUint32("value size", abi.ValueSize, other.ValueSize); err != nil {
+               return err
+       }
+       if err := checkUint32("max entries", abi.MaxEntries, other.MaxEntries); err != nil {
+               return err
+       }
+
+       if abi.InnerMap == nil {
+               if abi.Type == ArrayOfMaps || abi.Type == HashOfMaps {
+                       return errors.New("missing inner map ABI")
+               }
+
+               return nil
+       }
+
+       if other.InnerMap == nil {
+               return errors.New("missing inner map")
+       }
+
+       return errors.Wrap(abi.InnerMap.check(other.InnerMap), "inner map")
+}
+
+// ProgramABI describes a Program.
+//
+// Use it to assert that a Program matches what your code expects.
+type ProgramABI struct {
+       Type ProgramType
+}
+
+func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI {
+       return &ProgramABI{
+               spec.Type,
+       }
+}
+
+func newProgramABIFromFd(fd *bpfFD) (*ProgramABI, error) {
+       info, err := bpfGetProgInfoByFD(fd)
+       if err != nil {
+               return nil, err
+       }
+
+       return newProgramABIFromInfo(info), nil
+}
+
+func newProgramABIFromInfo(info *bpfProgInfo) *ProgramABI {
+       return &ProgramABI{
+               Type: ProgramType(info.progType),
+       }
+}
+
+// Check verifies that a Program conforms to the ABI.
+//
+// Members which have the zero value of their type
+// are not checked.
+func (abi *ProgramABI) Check(prog *Program) error {
+       if abi.Type != UnspecifiedProgram && prog.abi.Type != abi.Type {
+               return errors.Errorf("expected program type %s, have %s", abi.Type, prog.abi.Type)
+       }
+
+       return nil
+}
+
+func checkUint32(name string, want, have uint32) error {
+       if want != 0 && have != want {
+               return errors.Errorf("expected %s to be %d, have %d", name, want, have)
+       }
+       return nil
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/alu.go b/vendor/github.com/cilium/ebpf/asm/alu.go
new file mode 100644 (file)
index 0000000..70ccc4d
--- /dev/null
@@ -0,0 +1,149 @@
+package asm
+
+//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp
+
+// Source of ALU / ALU64 / Branch operations
+//
+//    msb      lsb
+//    +----+-+---+
+//    |op  |S|cls|
+//    +----+-+---+
+type Source uint8
+
+const sourceMask OpCode = 0x08
+
+// Source bitmask
+const (
+       // InvalidSource is returned by getters when invoked
+       // on non ALU / branch OpCodes.
+       InvalidSource Source = 0xff
+       // ImmSource src is from constant
+       ImmSource Source = 0x00
+       // RegSource src is from register
+       RegSource Source = 0x08
+)
+
+// The Endianness of a byte swap instruction.
+type Endianness uint8
+
+const endianMask = sourceMask
+
+// Endian flags
+const (
+       InvalidEndian Endianness = 0xff
+       // Convert to little endian
+       LE Endianness = 0x00
+       // Convert to big endian
+       BE Endianness = 0x08
+)
+
+// ALUOp are ALU / ALU64 operations
+//
+//    msb      lsb
+//    +----+-+---+
+//    |OP  |s|cls|
+//    +----+-+---+
+type ALUOp uint8
+
+const aluMask OpCode = 0xf0
+
+const (
+       // InvalidALUOp is returned by getters when invoked
+       // on non ALU OpCodes
+       InvalidALUOp ALUOp = 0xff
+       // Add - addition
+       Add ALUOp = 0x00
+       // Sub - subtraction
+       Sub ALUOp = 0x10
+       // Mul - multiplication
+       Mul ALUOp = 0x20
+       // Div - division
+       Div ALUOp = 0x30
+       // Or - bitwise or
+       Or ALUOp = 0x40
+       // And - bitwise and
+       And ALUOp = 0x50
+       // LSh - bitwise shift left
+       LSh ALUOp = 0x60
+       // RSh - bitwise shift right
+       RSh ALUOp = 0x70
+       // Neg - negation (arithmetic negate)
+       Neg ALUOp = 0x80
+       // Mod - modulo
+       Mod ALUOp = 0x90
+       // Xor - bitwise xor
+       Xor ALUOp = 0xa0
+       // Mov - move value from one place to another
+       Mov ALUOp = 0xb0
+       // ArSh - arithmetic shift
+       ArSh ALUOp = 0xc0
+       // Swap - endian conversions
+       Swap ALUOp = 0xd0
+)
+
+// HostTo converts from host to another endianness.
+func HostTo(endian Endianness, dst Register, size Size) Instruction {
+       var imm int64
+       switch size {
+       case Half:
+               imm = 16
+       case Word:
+               imm = 32
+       case DWord:
+               imm = 64
+       default:
+               return Instruction{OpCode: InvalidOpCode}
+       }
+
+       return Instruction{
+               OpCode:   OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)),
+               Dst:      dst,
+               Constant: imm,
+       }
+}
+
+// Op returns the OpCode for an ALU operation with a given source.
+func (op ALUOp) Op(source Source) OpCode {
+       return OpCode(ALU64Class).SetALUOp(op).SetSource(source)
+}
+
+// Reg emits `dst (op) src`.
+func (op ALUOp) Reg(dst, src Register) Instruction {
+       return Instruction{
+               OpCode: op.Op(RegSource),
+               Dst:    dst,
+               Src:    src,
+       }
+}
+
+// Imm emits `dst (op) value`.
+func (op ALUOp) Imm(dst Register, value int32) Instruction {
+       return Instruction{
+               OpCode:   op.Op(ImmSource),
+               Dst:      dst,
+               Constant: int64(value),
+       }
+}
+
+// Op32 returns the OpCode for a 32-bit ALU operation with a given source.
+func (op ALUOp) Op32(source Source) OpCode {
+       return OpCode(ALUClass).SetALUOp(op).SetSource(source)
+}
+
+// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst.
+func (op ALUOp) Reg32(dst, src Register) Instruction {
+       return Instruction{
+               OpCode: op.Op32(RegSource),
+               Dst:    dst,
+               Src:    src,
+       }
+}
+
+// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst.
+func (op ALUOp) Imm32(dst Register, value int32) Instruction {
+       return Instruction{
+               OpCode:   op.Op32(ImmSource),
+               Dst:      dst,
+               Constant: int64(value),
+       }
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/alu_string.go b/vendor/github.com/cilium/ebpf/asm/alu_string.go
new file mode 100644 (file)
index 0000000..72d3fe6
--- /dev/null
@@ -0,0 +1,107 @@
+// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT.
+
+package asm
+
+import "strconv"
+
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidSource-255]
+       _ = x[ImmSource-0]
+       _ = x[RegSource-8]
+}
+
+const (
+       _Source_name_0 = "ImmSource"
+       _Source_name_1 = "RegSource"
+       _Source_name_2 = "InvalidSource"
+)
+
+func (i Source) String() string {
+       switch {
+       case i == 0:
+               return _Source_name_0
+       case i == 8:
+               return _Source_name_1
+       case i == 255:
+               return _Source_name_2
+       default:
+               return "Source(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+}
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidEndian-255]
+       _ = x[LE-0]
+       _ = x[BE-8]
+}
+
+const (
+       _Endianness_name_0 = "LE"
+       _Endianness_name_1 = "BE"
+       _Endianness_name_2 = "InvalidEndian"
+)
+
+func (i Endianness) String() string {
+       switch {
+       case i == 0:
+               return _Endianness_name_0
+       case i == 8:
+               return _Endianness_name_1
+       case i == 255:
+               return _Endianness_name_2
+       default:
+               return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+}
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidALUOp-255]
+       _ = x[Add-0]
+       _ = x[Sub-16]
+       _ = x[Mul-32]
+       _ = x[Div-48]
+       _ = x[Or-64]
+       _ = x[And-80]
+       _ = x[LSh-96]
+       _ = x[RSh-112]
+       _ = x[Neg-128]
+       _ = x[Mod-144]
+       _ = x[Xor-160]
+       _ = x[Mov-176]
+       _ = x[ArSh-192]
+       _ = x[Swap-208]
+}
+
+const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp"
+
+var _ALUOp_map = map[ALUOp]string{
+       0:   _ALUOp_name[0:3],
+       16:  _ALUOp_name[3:6],
+       32:  _ALUOp_name[6:9],
+       48:  _ALUOp_name[9:12],
+       64:  _ALUOp_name[12:14],
+       80:  _ALUOp_name[14:17],
+       96:  _ALUOp_name[17:20],
+       112: _ALUOp_name[20:23],
+       128: _ALUOp_name[23:26],
+       144: _ALUOp_name[26:29],
+       160: _ALUOp_name[29:32],
+       176: _ALUOp_name[32:35],
+       192: _ALUOp_name[35:39],
+       208: _ALUOp_name[39:43],
+       255: _ALUOp_name[43:55],
+}
+
+func (i ALUOp) String() string {
+       if str, ok := _ALUOp_map[i]; ok {
+               return str
+       }
+       return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")"
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/doc.go b/vendor/github.com/cilium/ebpf/asm/doc.go
new file mode 100644 (file)
index 0000000..7031bdc
--- /dev/null
@@ -0,0 +1,2 @@
+// Package asm is an assembler for eBPF bytecode.
+package asm
diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go
new file mode 100644 (file)
index 0000000..97f794c
--- /dev/null
@@ -0,0 +1,143 @@
+package asm
+
+//go:generate stringer -output func_string.go -type=BuiltinFunc
+
+// BuiltinFunc is a built-in eBPF function.
+type BuiltinFunc int32
+
+// eBPF built-in functions
+//
+// You can regenerate this list using the following gawk script:
+//
+//    /FN\(.+\),/ {
+//      match($1, /\((.+)\)/, r)
+//      split(r[1], p, "_")
+//      printf "Fn"
+//      for (i in p) {
+//        printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2)
+//      }
+//      print ""
+//    }
+//
+// The script expects include/uapi/linux/bpf.h as its input.
+const (
+       FnUnspec BuiltinFunc = iota
+       FnMapLookupElem
+       FnMapUpdateElem
+       FnMapDeleteElem
+       FnProbeRead
+       FnKtimeGetNs
+       FnTracePrintk
+       FnGetPrandomU32
+       FnGetSmpProcessorId
+       FnSkbStoreBytes
+       FnL3CsumReplace
+       FnL4CsumReplace
+       FnTailCall
+       FnCloneRedirect
+       FnGetCurrentPidTgid
+       FnGetCurrentUidGid
+       FnGetCurrentComm
+       FnGetCgroupClassid
+       FnSkbVlanPush
+       FnSkbVlanPop
+       FnSkbGetTunnelKey
+       FnSkbSetTunnelKey
+       FnPerfEventRead
+       FnRedirect
+       FnGetRouteRealm
+       FnPerfEventOutput
+       FnSkbLoadBytes
+       FnGetStackid
+       FnCsumDiff
+       FnSkbGetTunnelOpt
+       FnSkbSetTunnelOpt
+       FnSkbChangeProto
+       FnSkbChangeType
+       FnSkbUnderCgroup
+       FnGetHashRecalc
+       FnGetCurrentTask
+       FnProbeWriteUser
+       FnCurrentTaskUnderCgroup
+       FnSkbChangeTail
+       FnSkbPullData
+       FnCsumUpdate
+       FnSetHashInvalid
+       FnGetNumaNodeId
+       FnSkbChangeHead
+       FnXdpAdjustHead
+       FnProbeReadStr
+       FnGetSocketCookie
+       FnGetSocketUid
+       FnSetHash
+       FnSetsockopt
+       FnSkbAdjustRoom
+       FnRedirectMap
+       FnSkRedirectMap
+       FnSockMapUpdate
+       FnXdpAdjustMeta
+       FnPerfEventReadValue
+       FnPerfProgReadValue
+       FnGetsockopt
+       FnOverrideReturn
+       FnSockOpsCbFlagsSet
+       FnMsgRedirectMap
+       FnMsgApplyBytes
+       FnMsgCorkBytes
+       FnMsgPullData
+       FnBind
+       FnXdpAdjustTail
+       FnSkbGetXfrmState
+       FnGetStack
+       FnSkbLoadBytesRelative
+       FnFibLookup
+       FnSockHashUpdate
+       FnMsgRedirectHash
+       FnSkRedirectHash
+       FnLwtPushEncap
+       FnLwtSeg6StoreBytes
+       FnLwtSeg6AdjustSrh
+       FnLwtSeg6Action
+       FnRcRepeat
+       FnRcKeydown
+       FnSkbCgroupId
+       FnGetCurrentCgroupId
+       FnGetLocalStorage
+       FnSkSelectReuseport
+       FnSkbAncestorCgroupId
+       FnSkLookupTcp
+       FnSkLookupUdp
+       FnSkRelease
+       FnMapPushElem
+       FnMapPopElem
+       FnMapPeekElem
+       FnMsgPushData
+       FnMsgPopData
+       FnRcPointerRel
+       FnSpinLock
+       FnSpinUnlock
+       FnSkFullsock
+       FnTcpSock
+       FnSkbEcnSetCe
+       FnGetListenerSock
+       FnSkcLookupTcp
+       FnTcpCheckSyncookie
+       FnSysctlGetName
+       FnSysctlGetCurrentValue
+       FnSysctlGetNewValue
+       FnSysctlSetNewValue
+       FnStrtol
+       FnStrtoul
+       FnSkStorageGet
+       FnSkStorageDelete
+       FnSendSignal
+       FnTcpGenSyncookie
+)
+
+// Call emits a function call.
+func (fn BuiltinFunc) Call() Instruction {
+       return Instruction{
+               OpCode:   OpCode(JumpClass).SetJumpOp(Call),
+               Constant: int64(fn),
+       }
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go
new file mode 100644 (file)
index 0000000..8860b9f
--- /dev/null
@@ -0,0 +1,133 @@
+// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT.
+
+package asm
+
+import "strconv"
+
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[FnUnspec-0]
+       _ = x[FnMapLookupElem-1]
+       _ = x[FnMapUpdateElem-2]
+       _ = x[FnMapDeleteElem-3]
+       _ = x[FnProbeRead-4]
+       _ = x[FnKtimeGetNs-5]
+       _ = x[FnTracePrintk-6]
+       _ = x[FnGetPrandomU32-7]
+       _ = x[FnGetSmpProcessorId-8]
+       _ = x[FnSkbStoreBytes-9]
+       _ = x[FnL3CsumReplace-10]
+       _ = x[FnL4CsumReplace-11]
+       _ = x[FnTailCall-12]
+       _ = x[FnCloneRedirect-13]
+       _ = x[FnGetCurrentPidTgid-14]
+       _ = x[FnGetCurrentUidGid-15]
+       _ = x[FnGetCurrentComm-16]
+       _ = x[FnGetCgroupClassid-17]
+       _ = x[FnSkbVlanPush-18]
+       _ = x[FnSkbVlanPop-19]
+       _ = x[FnSkbGetTunnelKey-20]
+       _ = x[FnSkbSetTunnelKey-21]
+       _ = x[FnPerfEventRead-22]
+       _ = x[FnRedirect-23]
+       _ = x[FnGetRouteRealm-24]
+       _ = x[FnPerfEventOutput-25]
+       _ = x[FnSkbLoadBytes-26]
+       _ = x[FnGetStackid-27]
+       _ = x[FnCsumDiff-28]
+       _ = x[FnSkbGetTunnelOpt-29]
+       _ = x[FnSkbSetTunnelOpt-30]
+       _ = x[FnSkbChangeProto-31]
+       _ = x[FnSkbChangeType-32]
+       _ = x[FnSkbUnderCgroup-33]
+       _ = x[FnGetHashRecalc-34]
+       _ = x[FnGetCurrentTask-35]
+       _ = x[FnProbeWriteUser-36]
+       _ = x[FnCurrentTaskUnderCgroup-37]
+       _ = x[FnSkbChangeTail-38]
+       _ = x[FnSkbPullData-39]
+       _ = x[FnCsumUpdate-40]
+       _ = x[FnSetHashInvalid-41]
+       _ = x[FnGetNumaNodeId-42]
+       _ = x[FnSkbChangeHead-43]
+       _ = x[FnXdpAdjustHead-44]
+       _ = x[FnProbeReadStr-45]
+       _ = x[FnGetSocketCookie-46]
+       _ = x[FnGetSocketUid-47]
+       _ = x[FnSetHash-48]
+       _ = x[FnSetsockopt-49]
+       _ = x[FnSkbAdjustRoom-50]
+       _ = x[FnRedirectMap-51]
+       _ = x[FnSkRedirectMap-52]
+       _ = x[FnSockMapUpdate-53]
+       _ = x[FnXdpAdjustMeta-54]
+       _ = x[FnPerfEventReadValue-55]
+       _ = x[FnPerfProgReadValue-56]
+       _ = x[FnGetsockopt-57]
+       _ = x[FnOverrideReturn-58]
+       _ = x[FnSockOpsCbFlagsSet-59]
+       _ = x[FnMsgRedirectMap-60]
+       _ = x[FnMsgApplyBytes-61]
+       _ = x[FnMsgCorkBytes-62]
+       _ = x[FnMsgPullData-63]
+       _ = x[FnBind-64]
+       _ = x[FnXdpAdjustTail-65]
+       _ = x[FnSkbGetXfrmState-66]
+       _ = x[FnGetStack-67]
+       _ = x[FnSkbLoadBytesRelative-68]
+       _ = x[FnFibLookup-69]
+       _ = x[FnSockHashUpdate-70]
+       _ = x[FnMsgRedirectHash-71]
+       _ = x[FnSkRedirectHash-72]
+       _ = x[FnLwtPushEncap-73]
+       _ = x[FnLwtSeg6StoreBytes-74]
+       _ = x[FnLwtSeg6AdjustSrh-75]
+       _ = x[FnLwtSeg6Action-76]
+       _ = x[FnRcRepeat-77]
+       _ = x[FnRcKeydown-78]
+       _ = x[FnSkbCgroupId-79]
+       _ = x[FnGetCurrentCgroupId-80]
+       _ = x[FnGetLocalStorage-81]
+       _ = x[FnSkSelectReuseport-82]
+       _ = x[FnSkbAncestorCgroupId-83]
+       _ = x[FnSkLookupTcp-84]
+       _ = x[FnSkLookupUdp-85]
+       _ = x[FnSkRelease-86]
+       _ = x[FnMapPushElem-87]
+       _ = x[FnMapPopElem-88]
+       _ = x[FnMapPeekElem-89]
+       _ = x[FnMsgPushData-90]
+       _ = x[FnMsgPopData-91]
+       _ = x[FnRcPointerRel-92]
+       _ = x[FnSpinLock-93]
+       _ = x[FnSpinUnlock-94]
+       _ = x[FnSkFullsock-95]
+       _ = x[FnTcpSock-96]
+       _ = x[FnSkbEcnSetCe-97]
+       _ = x[FnGetListenerSock-98]
+       _ = x[FnSkcLookupTcp-99]
+       _ = x[FnTcpCheckSyncookie-100]
+       _ = x[FnSysctlGetName-101]
+       _ = x[FnSysctlGetCurrentValue-102]
+       _ = x[FnSysctlGetNewValue-103]
+       _ = x[FnSysctlSetNewValue-104]
+       _ = x[FnStrtol-105]
+       _ = x[FnStrtoul-106]
+       _ = x[FnSkStorageGet-107]
+       _ = x[FnSkStorageDelete-108]
+       _ = x[FnSendSignal-109]
+       _ = x[FnTcpGenSyncookie-110]
+}
+
+const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie"
+
+var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632}
+
+func (i BuiltinFunc) String() string {
+       if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) {
+               return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+       return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]]
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go
new file mode 100644 (file)
index 0000000..c8ed6cf
--- /dev/null
@@ -0,0 +1,416 @@
+package asm
+
+import (
+       "encoding/binary"
+       "fmt"
+       "io"
+       "math"
+       "strings"
+
+       "github.com/pkg/errors"
+)
+
+// InstructionSize is the size of a BPF instruction in bytes
+const InstructionSize = 8
+
+// Instruction is a single eBPF instruction.
+type Instruction struct {
+       OpCode    OpCode    // packed operation byte (class plus mode/size, or op/source)
+       Dst       Register  // destination register
+       Src       Register  // source register
+       Offset    int16     // signed offset used by memory and jump instructions
+       Constant  int64     // immediate value; holds the full 64 bits for DWord loads
+       Reference string    // symbol this instruction refers to (map, label or function)
+       Symbol    string    // label naming this instruction's own position, if any
+}
+
+// Sym creates a symbol.
+//
+// It returns a copy of ins with Symbol set to name; the value receiver
+// means the original instruction is left unmodified.
+func (ins Instruction) Sym(name string) Instruction {
+       ins.Symbol = name
+       return ins
+}
+
+// Unmarshal decodes a BPF instruction.
+//
+// It reads one raw instruction slot from r in byte order bo. A 64 bit
+// immediate (DWord) load occupies a second slot carrying the upper 32
+// bits of the constant, which is read and validated as well. The first
+// return value is the number of bytes consumed: InstructionSize, or
+// twice that for DWord loads.
+func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) {
+       var bi bpfInstruction
+       err := binary.Read(r, bo, &bi)
+       if err != nil {
+               return 0, err
+       }
+
+       ins.OpCode = bi.OpCode
+       ins.Dst = bi.Registers.Dst()
+       ins.Src = bi.Registers.Src()
+       ins.Offset = bi.Offset
+       ins.Constant = int64(bi.Constant)
+
+       if !bi.OpCode.isDWordLoad() {
+               return InstructionSize, nil
+       }
+
+       // The second slot of a DWord load must have every field except
+       // Constant zeroed.
+       var bi2 bpfInstruction
+       if err := binary.Read(r, bo, &bi2); err != nil {
+               // No Wrap, to avoid io.EOF clash
+               return 0, errors.New("64bit immediate is missing second half")
+       }
+       if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 {
+               return 0, errors.New("64bit immediate has non-zero fields")
+       }
+       // Reassemble the 64 bit constant: low half from the first slot,
+       // high half from the second.
+       ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant)))
+
+       return 2 * InstructionSize, nil
+}
+
+// Marshal encodes a BPF instruction.
+//
+// It writes one instruction slot to w in byte order bo, plus a second
+// slot holding the upper 32 bits of the constant for 64 bit immediate
+// (DWord) loads. The first return value is the number of bytes
+// written: InstructionSize, or twice that for DWord loads.
+func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) {
+       if ins.OpCode == InvalidOpCode {
+               return 0, errors.New("invalid opcode")
+       }
+
+       isDWordLoad := ins.OpCode.isDWordLoad()
+
+       cons := int32(ins.Constant)
+       if isDWordLoad {
+               // Encode least significant 32bit first for 64bit operations.
+               cons = int32(uint32(ins.Constant))
+       }
+
+       bpfi := bpfInstruction{
+               ins.OpCode,
+               newBPFRegisters(ins.Dst, ins.Src),
+               ins.Offset,
+               cons,
+       }
+
+       if err := binary.Write(w, bo, &bpfi); err != nil {
+               return 0, err
+       }
+
+       if !isDWordLoad {
+               return InstructionSize, nil
+       }
+
+       // Second slot: only the high 32 bits of the constant are set,
+       // every other field stays zero.
+       bpfi = bpfInstruction{
+               Constant: int32(ins.Constant >> 32),
+       }
+
+       if err := binary.Write(w, bo, &bpfi); err != nil {
+               return 0, err
+       }
+
+       return 2 * InstructionSize, nil
+}
+
+// RewriteMapPtr changes an instruction to use a new map fd.
+//
+// Returns an error if the fd is invalid, or the instruction
+// is incorrect (only 64 bit immediate loads can carry a map pointer).
+func (ins *Instruction) RewriteMapPtr(fd int) error {
+       if !ins.OpCode.isDWordLoad() {
+               return errors.Errorf("%s is not a 64 bit load", ins.OpCode)
+       }
+
+       if fd < 0 {
+               return errors.New("invalid fd")
+       }
+
+       // Src == R1 marks the load as a map pointer load (see
+       // LoadMapPtr); the constant carries the fd itself.
+       ins.Src = R1
+       ins.Constant = int64(fd)
+       return nil
+}
+
+// Format implements fmt.Formatter.
+//
+// Only the %v verb is supported; any other verb prints an
+// "{UNRECOGNIZED: ...}" marker instead of the instruction.
+func (ins Instruction) Format(f fmt.State, c rune) {
+       if c != 'v' {
+               fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c)
+               return
+       }
+
+       op := ins.OpCode
+
+       if op == InvalidOpCode {
+               fmt.Fprint(f, "INVALID")
+               return
+       }
+
+       // Omit trailing space for Exit
+       if op.JumpOp() == Exit {
+               fmt.Fprint(f, op)
+               return
+       }
+
+       // Which operands are worth printing depends on the instruction
+       // class and, within it, the addressing mode or jump operation.
+       fmt.Fprintf(f, "%v ", op)
+       switch cls := op.Class(); cls {
+       case LdClass, LdXClass, StClass, StXClass:
+               switch op.Mode() {
+               case ImmMode:
+                       fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant)
+               case AbsMode:
+                       fmt.Fprintf(f, "imm: %d", ins.Constant)
+               case IndMode:
+                       fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant)
+               case MemMode:
+                       fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant)
+               case XAddMode:
+                       fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src)
+               }
+
+       case ALU64Class, ALUClass:
+               fmt.Fprintf(f, "dst: %s ", ins.Dst)
+               if op.ALUOp() == Swap || op.Source() == ImmSource {
+                       fmt.Fprintf(f, "imm: %d", ins.Constant)
+               } else {
+                       fmt.Fprintf(f, "src: %s", ins.Src)
+               }
+
+       case JumpClass:
+               switch jop := op.JumpOp(); jop {
+               case Call:
+                       if ins.Src == R1 {
+                               // bpf-to-bpf call
+                               fmt.Fprint(f, ins.Constant)
+                       } else {
+                               // Otherwise the constant identifies a
+                               // builtin helper function.
+                               fmt.Fprint(f, BuiltinFunc(ins.Constant))
+                       }
+
+               default:
+                       fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset)
+                       if op.Source() == ImmSource {
+                               fmt.Fprintf(f, "imm: %d", ins.Constant)
+                       } else {
+                               fmt.Fprintf(f, "src: %s", ins.Src)
+                       }
+               }
+       }
+
+       // Append the symbolic reference, if any.
+       if ins.Reference != "" {
+               fmt.Fprintf(f, " <%s>", ins.Reference)
+       }
+}
+
+// Instructions is an eBPF program.
+type Instructions []Instruction
+
+// String renders the program with the default Format settings.
+func (insns Instructions) String() string {
+       return fmt.Sprint(insns)
+}
+
+// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd.
+//
+// Every instruction whose Reference equals symbol is rewritten via
+// Instruction.RewriteMapPtr.
+//
+// Returns an error if the symbol isn't used, see IsUnreferencedSymbol.
+func (insns Instructions) RewriteMapPtr(symbol string, fd int) error {
+       if symbol == "" {
+               return errors.New("empty symbol")
+       }
+
+       found := false
+       for i := range insns {
+               // Take a pointer so the rewrite mutates the slice element
+               // in place.
+               ins := &insns[i]
+               if ins.Reference != symbol {
+                       continue
+               }
+
+               if err := ins.RewriteMapPtr(fd); err != nil {
+                       return err
+               }
+
+               found = true
+       }
+
+       if !found {
+               return &unreferencedSymbolError{symbol}
+       }
+
+       return nil
+}
+
+// SymbolOffsets returns the set of symbols and their offset in
+// the instructions.
+//
+// Offsets are plain indices into insns (not marshalled positions; for
+// those see marshalledOffsets). Duplicate symbols are an error.
+func (insns Instructions) SymbolOffsets() (map[string]int, error) {
+       offsets := make(map[string]int)
+
+       for i, ins := range insns {
+               if ins.Symbol == "" {
+                       continue
+               }
+
+               if _, ok := offsets[ins.Symbol]; ok {
+                       return nil, errors.Errorf("duplicate symbol %s", ins.Symbol)
+               }
+
+               offsets[ins.Symbol] = i
+       }
+
+       return offsets, nil
+}
+
+// ReferenceOffsets returns the set of references and their offset in
+// the instructions.
+//
+// A reference may occur at several positions, so each symbol maps to
+// the list of indices that mention it.
+func (insns Instructions) ReferenceOffsets() map[string][]int {
+       offsets := make(map[string][]int)
+
+       for i, ins := range insns {
+               if ins.Reference == "" {
+                       continue
+               }
+
+               offsets[ins.Reference] = append(offsets[ins.Reference], i)
+       }
+
+       return offsets
+}
+
+// marshalledOffsets returns each symbol's position counted in
+// marshalled instruction slots rather than slice indices: a DWord
+// immediate load occupies two slots (see OpCode.marshalledInstructions),
+// so the two countings can differ. Duplicate symbols are an error.
+func (insns Instructions) marshalledOffsets() (map[string]int, error) {
+       symbols := make(map[string]int)
+
+       marshalledPos := 0
+       for _, ins := range insns {
+               currentPos := marshalledPos
+               marshalledPos += ins.OpCode.marshalledInstructions()
+
+               if ins.Symbol == "" {
+                       continue
+               }
+
+               if _, ok := symbols[ins.Symbol]; ok {
+                       return nil, errors.Errorf("duplicate symbol %s", ins.Symbol)
+               }
+
+               symbols[ins.Symbol] = currentPos
+       }
+
+       return symbols, nil
+}
+
+// Format implements fmt.Formatter.
+//
+// You can control indentation of symbols by
+// specifying a width. Setting a precision controls the indentation of
+// instructions.
+// The default character is a tab, which can be overridden by specifying
+// the ' ' space flag.
+func (insns Instructions) Format(f fmt.State, c rune) {
+       if c != 's' && c != 'v' {
+               fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c)
+               return
+       }
+
+       // Precision is better in this case, because it allows
+       // specifying 0 padding easily.
+       padding, ok := f.Precision()
+       if !ok {
+               padding = 1
+       }
+
+       indent := strings.Repeat("\t", padding)
+       if f.Flag(' ') {
+               indent = strings.Repeat(" ", padding)
+       }
+
+       // Symbols default to one level less indentation than
+       // instructions, clamped at zero.
+       symPadding, ok := f.Width()
+       if !ok {
+               symPadding = padding - 1
+       }
+       if symPadding < 0 {
+               symPadding = 0
+       }
+
+       symIndent := strings.Repeat("\t", symPadding)
+       if f.Flag(' ') {
+               symIndent = strings.Repeat(" ", symPadding)
+       }
+
+       // Figure out how many digits we need to represent the highest
+       // offset.
+       highestOffset := 0
+       for _, ins := range insns {
+               highestOffset += ins.OpCode.marshalledInstructions()
+       }
+       // NOTE(review): for an empty program highestOffset is 0 and
+       // Log10 returns -Inf; the float-to-int conversion is then not
+       // well defined by the spec — confirm this path is unreachable
+       // or harmless for callers.
+       offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset))))
+
+       // Offsets are printed in marshalled instruction slots so they
+       // line up with what the kernel verifier reports.
+       offset := 0
+       for _, ins := range insns {
+               if ins.Symbol != "" {
+                       fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol)
+               }
+               fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins)
+               offset += ins.OpCode.marshalledInstructions()
+       }
+
+       // NOTE(review): this bare return is redundant.
+       return
+}
+
+// Marshal encodes a BPF program into the kernel format.
+//
+// Unresolved references are fixed up on the fly: a Call whose
+// Constant is the -1 placeholder (bpf-to-bpf call, see JumpOp.Label)
+// and a jump whose Offset is the -1 placeholder are both rewritten to
+// a PC-relative distance computed from the referenced symbol's
+// marshalled position. Referencing a missing symbol is an error.
+func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
+       absoluteOffsets, err := insns.marshalledOffsets()
+       if err != nil {
+               return err
+       }
+
+       // num tracks the current position in marshalled instruction
+       // slots (DWord loads advance it by two).
+       num := 0
+       for i, ins := range insns {
+               switch {
+               case ins.OpCode.JumpOp() == Call && ins.Constant == -1:
+                       // Rewrite bpf to bpf call
+                       offset, ok := absoluteOffsets[ins.Reference]
+                       if !ok {
+                               return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
+                       }
+
+                       ins.Constant = int64(offset - num - 1)
+
+               case ins.OpCode.Class() == JumpClass && ins.Offset == -1:
+                       // Rewrite jump to label
+                       offset, ok := absoluteOffsets[ins.Reference]
+                       if !ok {
+                               return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
+                       }
+
+                       ins.Offset = int16(offset - num - 1)
+               }
+
+               n, err := ins.Marshal(w, bo)
+               if err != nil {
+                       return errors.Wrapf(err, "instruction %d", i)
+               }
+
+               num += int(n / InstructionSize)
+       }
+       return nil
+}
+
+// bpfInstruction is the wire representation of a single instruction
+// slot, as read and written by Unmarshal and Marshal.
+type bpfInstruction struct {
+       OpCode    OpCode
+       Registers bpfRegisters
+       Offset    int16
+       Constant  int32
+}
+
+// bpfRegisters packs the destination register into the low nibble and
+// the source register into the high nibble of a single byte.
+type bpfRegisters uint8
+
+// newBPFRegisters packs dst and src into the wire format.
+func newBPFRegisters(dst, src Register) bpfRegisters {
+       return bpfRegisters((src << 4) | (dst & 0xF))
+}
+
+// Dst extracts the destination register from the low nibble.
+func (r bpfRegisters) Dst() Register {
+       return Register(r & 0xF)
+}
+
+// Src extracts the source register from the high nibble.
+func (r bpfRegisters) Src() Register {
+       return Register(r >> 4)
+}
+
+// unreferencedSymbolError is returned by Instructions.RewriteMapPtr
+// when no instruction references the requested symbol.
+type unreferencedSymbolError struct {
+       symbol string
+}
+
+// Error implements the error interface.
+func (use *unreferencedSymbolError) Error() string {
+       return fmt.Sprintf("unreferenced symbol %s", use.symbol)
+}
+
+// IsUnreferencedSymbol returns true if err was caused by
+// an unreferenced symbol.
+func IsUnreferencedSymbol(err error) bool {
+       _, ok := err.(*unreferencedSymbolError)
+       return ok
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go
new file mode 100644 (file)
index 0000000..33c9b56
--- /dev/null
@@ -0,0 +1,109 @@
+package asm
+
+//go:generate stringer -output jump_string.go -type=JumpOp
+
+// JumpOp affect control flow.
+//
+//    msb      lsb
+//    +----+-+---+
+//    |OP  |s|cls|
+//    +----+-+---+
+type JumpOp uint8
+
+// jumpMask extracts the operation bits of a jump OpCode; the bit
+// layout coincides with the ALU operation mask.
+const jumpMask OpCode = aluMask
+
+const (
+       // InvalidJumpOp is returned by getters when invoked
+       // on non branch OpCodes
+       InvalidJumpOp JumpOp = 0xff
+       // Ja jumps by offset unconditionally
+       Ja JumpOp = 0x00
+       // JEq jumps by offset if r == imm
+       JEq JumpOp = 0x10
+       // JGT jumps by offset if r > imm
+       JGT JumpOp = 0x20
+       // JGE jumps by offset if r >= imm
+       JGE JumpOp = 0x30
+       // JSet jumps by offset if r & imm
+       JSet JumpOp = 0x40
+       // JNE jumps by offset if r != imm
+       JNE JumpOp = 0x50
+       // JSGT jumps by offset if signed r > signed imm
+       JSGT JumpOp = 0x60
+       // JSGE jumps by offset if signed r >= signed imm
+       JSGE JumpOp = 0x70
+       // Call builtin or user defined function from imm
+       Call JumpOp = 0x80
+       // Exit ends execution, with value in r0
+       Exit JumpOp = 0x90
+       // JLT jumps by offset if r < imm
+       JLT JumpOp = 0xa0
+       // JLE jumps by offset if r <= imm
+       JLE JumpOp = 0xb0
+       // JSLT jumps by offset if signed r < signed imm
+       JSLT JumpOp = 0xc0
+       // JSLE jumps by offset if signed r <= signed imm
+       JSLE JumpOp = 0xd0
+)
+
+// Return emits an exit instruction.
+//
+// Requires a return value in R0. The emitted instruction is of
+// JumpClass with the Exit operation and no operands.
+func Return() Instruction {
+       return Instruction{
+               OpCode: OpCode(JumpClass).SetJumpOp(Exit),
+       }
+}
+
+// Op returns the OpCode for a given jump source.
+//
+// Use this when constructing instructions by hand rather than via
+// Imm, Reg or Label.
+func (op JumpOp) Op(source Source) OpCode {
+       return OpCode(JumpClass).SetJumpOp(op).SetSource(source)
+}
+
+// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled.
+//
+// Exit, Call and Ja take no comparison operands, so requesting them
+// here yields an instruction with InvalidOpCode. Offset is set to the
+// -1 placeholder and resolved from label by Instructions.Marshal.
+func (op JumpOp) Imm(dst Register, value int32, label string) Instruction {
+       if op == Exit || op == Call || op == Ja {
+               return Instruction{OpCode: InvalidOpCode}
+       }
+
+       return Instruction{
+               OpCode:    OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource),
+               Dst:       dst,
+               Offset:    -1,
+               Constant:  int64(value),
+               Reference: label,
+       }
+}
+
+// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled.
+//
+// Exit, Call and Ja take no comparison operands, so requesting them
+// here yields an instruction with InvalidOpCode. Offset is set to the
+// -1 placeholder and resolved from label by Instructions.Marshal.
+func (op JumpOp) Reg(dst, src Register, label string) Instruction {
+       if op == Exit || op == Call || op == Ja {
+               return Instruction{OpCode: InvalidOpCode}
+       }
+
+       return Instruction{
+               OpCode:    OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource),
+               Dst:       dst,
+               Src:       src,
+               Offset:    -1,
+               Reference: label,
+       }
+}
+
+// Label adjusts PC to the address of the label.
+//
+// For Call a bpf-to-bpf call is emitted instead: Src == R1 together
+// with the Constant placeholder -1 marks the call for resolution by
+// Instructions.Marshal. All other operations use the Offset
+// placeholder -1 and are resolved the same way.
+func (op JumpOp) Label(label string) Instruction {
+       if op == Call {
+               return Instruction{
+                       OpCode:    OpCode(JumpClass).SetJumpOp(Call),
+                       Src:       R1,
+                       Constant:  -1,
+                       Reference: label,
+               }
+       }
+
+       return Instruction{
+               OpCode:    OpCode(JumpClass).SetJumpOp(op),
+               Offset:    -1,
+               Reference: label,
+       }
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/jump_string.go b/vendor/github.com/cilium/ebpf/asm/jump_string.go
new file mode 100644 (file)
index 0000000..85a4aaf
--- /dev/null
@@ -0,0 +1,53 @@
+// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT.
+
+package asm
+
+import "strconv"
+
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidJumpOp-255]
+       _ = x[Ja-0]
+       _ = x[JEq-16]
+       _ = x[JGT-32]
+       _ = x[JGE-48]
+       _ = x[JSet-64]
+       _ = x[JNE-80]
+       _ = x[JSGT-96]
+       _ = x[JSGE-112]
+       _ = x[Call-128]
+       _ = x[Exit-144]
+       _ = x[JLT-160]
+       _ = x[JLE-176]
+       _ = x[JSLT-192]
+       _ = x[JSLE-208]
+}
+
+// _JumpOp_name concatenates every JumpOp constant name; _JumpOp_map
+// below slices the matching substring out for each value.
+// NOTE(review): stringer-generated code — regenerate, do not hand-edit.
+const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp"
+
+var _JumpOp_map = map[JumpOp]string{
+       0:   _JumpOp_name[0:2],
+       16:  _JumpOp_name[2:5],
+       32:  _JumpOp_name[5:8],
+       48:  _JumpOp_name[8:11],
+       64:  _JumpOp_name[11:15],
+       80:  _JumpOp_name[15:18],
+       96:  _JumpOp_name[18:22],
+       112: _JumpOp_name[22:26],
+       128: _JumpOp_name[26:30],
+       144: _JumpOp_name[30:34],
+       160: _JumpOp_name[34:37],
+       176: _JumpOp_name[37:40],
+       192: _JumpOp_name[40:44],
+       208: _JumpOp_name[44:48],
+       255: _JumpOp_name[48:61],
+}
+
+// String returns the constant name for i, or a numeric fallback such
+// as "JumpOp(3)" for values outside the generated map.
+func (i JumpOp) String() string {
+       if str, ok := _JumpOp_map[i]; ok {
+               return str
+       }
+       return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")"
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go
new file mode 100644 (file)
index 0000000..ab0e92f
--- /dev/null
@@ -0,0 +1,189 @@
+package asm
+
+//go:generate stringer -output load_store_string.go -type=Mode,Size
+
+// Mode for load and store operations
+//
+//    msb      lsb
+//    +---+--+---+
+//    |MDE|sz|cls|
+//    +---+--+---+
+type Mode uint8
+
+// modeMask extracts the mode bits from a load / store OpCode.
+const modeMask OpCode = 0xe0
+
+const (
+       // InvalidMode is returned by getters when invoked
+       // on non load / store OpCodes
+       InvalidMode Mode = 0xff
+       // ImmMode - immediate value
+       ImmMode Mode = 0x00
+       // AbsMode - immediate value + offset
+       AbsMode Mode = 0x20
+       // IndMode - indirect (imm+src)
+       IndMode Mode = 0x40
+       // MemMode - load from memory
+       MemMode Mode = 0x60
+       // XAddMode - add atomically across processors.
+       XAddMode Mode = 0xc0
+)
+
+// Size of load and store operations
+//
+//    msb      lsb
+//    +---+--+---+
+//    |mde|SZ|cls|
+//    +---+--+---+
+type Size uint8
+
+// sizeMask extracts the size bits from a load / store OpCode.
+const sizeMask OpCode = 0x18
+
+const (
+       // InvalidSize is returned by getters when invoked
+       // on non load / store OpCodes
+       InvalidSize Size = 0xff
+       // DWord - double word; 64 bits
+       DWord Size = 0x18
+       // Word - word; 32 bits
+       Word Size = 0x00
+       // Half - half-word; 16 bits
+       Half Size = 0x08
+       // Byte - byte; 8 bits
+       Byte Size = 0x10
+)
+
+// Sizeof returns the size in bytes.
+//
+// It returns -1 for InvalidSize or any unrecognized value.
+func (s Size) Sizeof() int {
+       switch s {
+       case DWord:
+               return 8
+       case Word:
+               return 4
+       case Half:
+               return 2
+       case Byte:
+               return 1
+       default:
+               return -1
+       }
+}
+
+// LoadMemOp returns the OpCode to load a value of given size from memory.
+func LoadMemOp(size Size) OpCode {
+       return OpCode(LdXClass).SetMode(MemMode).SetSize(size)
+}
+
+// LoadMem emits `dst = *(size *)(src + offset)`.
+func LoadMem(dst, src Register, offset int16, size Size) Instruction {
+       return Instruction{
+               OpCode: LoadMemOp(size),
+               Dst:    dst,
+               Src:    src,
+               Offset: offset,
+       }
+}
+
+// LoadImmOp returns the OpCode to load an immediate of given size.
+//
+// As of kernel 4.20, only DWord size is accepted.
+func LoadImmOp(size Size) OpCode {
+       return OpCode(LdClass).SetMode(ImmMode).SetSize(size)
+}
+
+// LoadImm emits `dst = (size)value`.
+//
+// As of kernel 4.20, only DWord size is accepted.
+func LoadImm(dst Register, value int64, size Size) Instruction {
+       return Instruction{
+               OpCode:   LoadImmOp(size),
+               Dst:      dst,
+               Constant: value,
+       }
+}
+
+// LoadMapPtr stores a pointer to a map in dst.
+//
+// A negative fd yields an instruction with InvalidOpCode. Src == R1
+// marks the DWord load as a map pointer load (see also
+// Instruction.RewriteMapPtr); the fd travels in the constant.
+func LoadMapPtr(dst Register, fd int) Instruction {
+       if fd < 0 {
+               return Instruction{OpCode: InvalidOpCode}
+       }
+
+       return Instruction{
+               OpCode:   LoadImmOp(DWord),
+               Dst:      dst,
+               Src:      R1,
+               Constant: int64(fd),
+       }
+}
+
+// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff.
+func LoadIndOp(size Size) OpCode {
+       return OpCode(LdClass).SetMode(IndMode).SetSize(size)
+}
+
+// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`.
+//
+// Note the packet offset travels in Constant, not in the Offset field.
+func LoadInd(dst, src Register, offset int32, size Size) Instruction {
+       return Instruction{
+               OpCode:   LoadIndOp(size),
+               Dst:      dst,
+               Src:      src,
+               Constant: int64(offset),
+       }
+}
+
+// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff.
+func LoadAbsOp(size Size) OpCode {
+       return OpCode(LdClass).SetMode(AbsMode).SetSize(size)
+}
+
+// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`.
+//
+// The result always lands in R0; the packet offset travels in Constant.
+func LoadAbs(offset int32, size Size) Instruction {
+       return Instruction{
+               OpCode:   LoadAbsOp(size),
+               Dst:      R0,
+               Constant: int64(offset),
+       }
+}
+
+// StoreMemOp returns the OpCode for storing a register of given size in memory.
+func StoreMemOp(size Size) OpCode {
+       return OpCode(StXClass).SetMode(MemMode).SetSize(size)
+}
+
+// StoreMem emits `*(size *)(dst + offset) = src`
+func StoreMem(dst Register, offset int16, src Register, size Size) Instruction {
+       return Instruction{
+               OpCode: StoreMemOp(size),
+               Dst:    dst,
+               Src:    src,
+               Offset: offset,
+       }
+}
+
+// StoreImmOp returns the OpCode for storing an immediate of given size in memory.
+//
+// Note this uses StClass, not StXClass, since the value comes from an
+// immediate rather than a register.
+func StoreImmOp(size Size) OpCode {
+       return OpCode(StClass).SetMode(MemMode).SetSize(size)
+}
+
+// StoreImm emits `*(size *)(dst + offset) = value`.
+func StoreImm(dst Register, offset int16, value int64, size Size) Instruction {
+       return Instruction{
+               OpCode:   StoreImmOp(size),
+               Dst:      dst,
+               Offset:   offset,
+               Constant: value,
+       }
+}
+
+// StoreXAddOp returns the OpCode to atomically add a register to a value in memory.
+func StoreXAddOp(size Size) OpCode {
+       return OpCode(StXClass).SetMode(XAddMode).SetSize(size)
+}
+
+// StoreXAdd atomically adds src to *dst.
+func StoreXAdd(dst, src Register, size Size) Instruction {
+       return Instruction{
+               OpCode: StoreXAddOp(size),
+               Dst:    dst,
+               Src:    src,
+       }
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/load_store_string.go b/vendor/github.com/cilium/ebpf/asm/load_store_string.go
new file mode 100644 (file)
index 0000000..76d29a0
--- /dev/null
@@ -0,0 +1,80 @@
+// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT.
+
+package asm
+
+import "strconv"
+
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidMode-255]
+       _ = x[ImmMode-0]
+       _ = x[AbsMode-32]
+       _ = x[IndMode-64]
+       _ = x[MemMode-96]
+       _ = x[XAddMode-192]
+}
+
+// NOTE(review): stringer-generated code — regenerate, do not hand-edit.
+const (
+       _Mode_name_0 = "ImmMode"
+       _Mode_name_1 = "AbsMode"
+       _Mode_name_2 = "IndMode"
+       _Mode_name_3 = "MemMode"
+       _Mode_name_4 = "XAddMode"
+       _Mode_name_5 = "InvalidMode"
+)
+
+// String returns the constant name for i, or a numeric fallback such
+// as "Mode(1)" for values outside the generated set.
+func (i Mode) String() string {
+       switch {
+       case i == 0:
+               return _Mode_name_0
+       case i == 32:
+               return _Mode_name_1
+       case i == 64:
+               return _Mode_name_2
+       case i == 96:
+               return _Mode_name_3
+       case i == 192:
+               return _Mode_name_4
+       case i == 255:
+               return _Mode_name_5
+       default:
+               return "Mode(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+}
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[InvalidSize-255]
+       _ = x[DWord-24]
+       _ = x[Word-0]
+       _ = x[Half-8]
+       _ = x[Byte-16]
+}
+
+const (
+       _Size_name_0 = "Word"
+       _Size_name_1 = "Half"
+       _Size_name_2 = "Byte"
+       _Size_name_3 = "DWord"
+       _Size_name_4 = "InvalidSize"
+)
+
+// String returns the constant name for i, or a numeric fallback such
+// as "Size(1)" for values outside the generated set.
+func (i Size) String() string {
+       switch {
+       case i == 0:
+               return _Size_name_0
+       case i == 8:
+               return _Size_name_1
+       case i == 16:
+               return _Size_name_2
+       case i == 24:
+               return _Size_name_3
+       case i == 255:
+               return _Size_name_4
+       default:
+               return "Size(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go
new file mode 100644 (file)
index 0000000..d796de3
--- /dev/null
@@ -0,0 +1,237 @@
+package asm
+
+import (
+       "fmt"
+       "strings"
+)
+
+//go:generate stringer -output opcode_string.go -type=Class
+
+// encoding distinguishes the two opcode layouts: load/store opcodes
+// carry mode and size bits, jump/ALU opcodes carry operation and
+// source bits.
+type encoding int
+
+const (
+       unknownEncoding encoding = iota
+       loadOrStore
+       jumpOrALU
+)
+
+// Class of operations
+//
+//    msb      lsb
+//    +---+--+---+
+//    |  ??  |CLS|
+//    +---+--+---+
+type Class uint8
+
+// classMask extracts the class bits (lowest three) from an OpCode.
+const classMask OpCode = 0x07
+
+const (
+       // LdClass load memory
+       LdClass Class = 0x00
+       // LdXClass load memory from constant
+       LdXClass Class = 0x01
+       // StClass load register from memory
+       StClass Class = 0x02
+       // StXClass load register from constant
+       StXClass Class = 0x03
+       // ALUClass arithmetic operators
+       ALUClass Class = 0x04
+       // JumpClass jump operators
+       JumpClass Class = 0x05
+       // ALU64Class arithmetic in 64 bit mode
+       ALU64Class Class = 0x07
+)
+
+// encoding reports which opcode layout a class uses, or
+// unknownEncoding for unrecognized classes.
+func (cls Class) encoding() encoding {
+       switch cls {
+       case LdClass, LdXClass, StClass, StXClass:
+               return loadOrStore
+       case ALU64Class, ALUClass, JumpClass:
+               return jumpOrALU
+       default:
+               return unknownEncoding
+       }
+}
+
+// OpCode is a packed eBPF opcode.
+//
+// Its encoding is defined by a Class value:
+//
+//    msb      lsb
+//    +----+-+---+
+//    | ???? |CLS|
+//    +----+-+---+
+type OpCode uint8
+
+// InvalidOpCode is returned by setters on OpCode
+const InvalidOpCode OpCode = 0xff
+
+// marshalledInstructions returns the number of BPF instructions required
+// to encode this opcode.
+//
+// Only 64 bit immediate (DWord) loads need two slots; everything else
+// takes one.
+func (op OpCode) marshalledInstructions() int {
+       if op == LoadImmOp(DWord) {
+               return 2
+       }
+       return 1
+}
+
+// isDWordLoad reports whether op is a 64 bit immediate load, the only
+// opcode that occupies two instruction slots.
+func (op OpCode) isDWordLoad() bool {
+       return op == LoadImmOp(DWord)
+}
+
+// Class returns the class of operation.
+func (op OpCode) Class() Class {
+       return Class(op & classMask)
+}
+
+// Mode returns the mode for load and store operations.
+//
+// It returns InvalidMode if op is not a load / store opcode.
+func (op OpCode) Mode() Mode {
+       if op.Class().encoding() != loadOrStore {
+               return InvalidMode
+       }
+       return Mode(op & modeMask)
+}
+
+// Size returns the size for load and store operations.
+//
+// It returns InvalidSize if op is not a load / store opcode.
+func (op OpCode) Size() Size {
+       if op.Class().encoding() != loadOrStore {
+               return InvalidSize
+       }
+       return Size(op & sizeMask)
+}
+
+// Source returns the source for branch and ALU operations.
+//
+// It returns InvalidSource if op is not a jump / ALU opcode, or if it
+// is a byte swap (whose bits encode endianness instead).
+func (op OpCode) Source() Source {
+       if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap {
+               return InvalidSource
+       }
+       return Source(op & sourceMask)
+}
+
+// ALUOp returns the ALUOp.
+//
+// It returns InvalidALUOp if op is not a jump / ALU opcode.
+func (op OpCode) ALUOp() ALUOp {
+       if op.Class().encoding() != jumpOrALU {
+               return InvalidALUOp
+       }
+       return ALUOp(op & aluMask)
+}
+
+// Endianness returns the Endianness for a byte swap instruction.
+//
+// It returns InvalidEndian for any opcode that is not a Swap.
+func (op OpCode) Endianness() Endianness {
+       if op.ALUOp() != Swap {
+               return InvalidEndian
+       }
+       return Endianness(op & endianMask)
+}
+
+// JumpOp returns the JumpOp.
+//
+// It returns InvalidJumpOp if op is not a jump / ALU opcode.
+func (op OpCode) JumpOp() JumpOp {
+       if op.Class().encoding() != jumpOrALU {
+               return InvalidJumpOp
+       }
+       return JumpOp(op & jumpMask)
+}
+
+// SetMode sets the mode on load and store operations.
+//
+// Returns InvalidOpCode if op is of the wrong class, or if mode has
+// bits outside the mode field.
+func (op OpCode) SetMode(mode Mode) OpCode {
+       if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) {
+               return InvalidOpCode
+       }
+       return (op & ^modeMask) | OpCode(mode)
+}
+
+// SetSize sets the size on load and store operations.
+//
+// Returns InvalidOpCode if op is of the wrong class, or if size has
+// bits outside the size field.
+func (op OpCode) SetSize(size Size) OpCode {
+       if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) {
+               return InvalidOpCode
+       }
+       return (op & ^sizeMask) | OpCode(size)
+}
+
+// SetSource sets the source on jump and ALU operations.
+//
+// Returns InvalidOpCode if op is of the wrong class, or if source has
+// bits outside the source field.
+func (op OpCode) SetSource(source Source) OpCode {
+       if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) {
+               return InvalidOpCode
+       }
+       return (op & ^sourceMask) | OpCode(source)
+}
+
+// SetALUOp sets the ALUOp on ALU operations.
+//
+// Returns InvalidOpCode if op is of the wrong class (JumpClass is
+// deliberately excluded here), or if alu has bits outside the field.
+func (op OpCode) SetALUOp(alu ALUOp) OpCode {
+       class := op.Class()
+       if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) {
+               return InvalidOpCode
+       }
+       return (op & ^aluMask) | OpCode(alu)
+}
+
+// SetJumpOp sets the JumpOp on jump operations.
+//
+// Returns InvalidOpCode if op is of the wrong class, or if jump has
+// bits outside the field.
+func (op OpCode) SetJumpOp(jump JumpOp) OpCode {
+       if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) {
+               return InvalidOpCode
+       }
+       return (op & ^jumpMask) | OpCode(jump)
+}
+
+// String renders the opcode in a compact human readable form, e.g.
+// "LdXMemW" or "Add32Imm", composed from the class, mode/size or
+// operation/source sub-strings. Unrecognized classes print as hex.
+func (op OpCode) String() string {
+       var f strings.Builder
+
+       switch class := op.Class(); class {
+       case LdClass, LdXClass, StClass, StXClass:
+               // E.g. "LdXMemDW": class, then mode, then size suffix.
+               f.WriteString(strings.TrimSuffix(class.String(), "Class"))
+
+               mode := op.Mode()
+               f.WriteString(strings.TrimSuffix(mode.String(), "Mode"))
+
+               switch op.Size() {
+               case DWord:
+                       f.WriteString("DW")
+               case Word:
+                       f.WriteString("W")
+               case Half:
+                       f.WriteString("H")
+               case Byte:
+                       f.WriteString("B")
+               }
+
+       case ALU64Class, ALUClass:
+               f.WriteString(op.ALUOp().String())
+
+               if op.ALUOp() == Swap {
+                       // Width for Endian is controlled by Constant
+                       f.WriteString(op.Endianness().String())
+               } else {
+                       // 32 bit ALU operations get an explicit width marker.
+                       if class == ALUClass {
+                               f.WriteString("32")
+                       }
+
+                       f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
+               }
+
+       case JumpClass:
+               f.WriteString(op.JumpOp().String())
+               if jop := op.JumpOp(); jop != Exit && jop != Call {
+                       f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
+               }
+
+       default:
+               fmt.Fprintf(&f, "%#x", op)
+       }
+
+       return f.String()
+}
+
+// valid returns true if all bits in value are covered by mask, i.e.
+// value does not stray outside the opcode field that mask selects.
+func valid(value, mask OpCode) bool {
+       return value & ^mask == 0
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/opcode_string.go b/vendor/github.com/cilium/ebpf/asm/opcode_string.go
new file mode 100644 (file)
index 0000000..079ce1d
--- /dev/null
@@ -0,0 +1,38 @@
+// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT.
+
+package asm
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[LdClass-0]
+	_ = x[LdXClass-1]
+	_ = x[StClass-2]
+	_ = x[StXClass-3]
+	_ = x[ALUClass-4]
+	_ = x[JumpClass-5]
+	_ = x[ALU64Class-7]
+}
+
+const (
+	_Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass"
+	_Class_name_1 = "ALU64Class"
+)
+
+var (
+	_Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47}
+)
+
+// NOTE(review): generated code — regenerate with stringer rather than editing
+// by hand. There is no Class constant with value 6, so 6 (and anything else
+// outside 0-5 and 7) is formatted by the default "Class(n)" branch.
+func (i Class) String() string {
+	switch {
+	case 0 <= i && i <= 5:
+		return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]]
+	case i == 7:
+		return _Class_name_1
+	default:
+		return "Class(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go
new file mode 100644 (file)
index 0000000..4f284fb
--- /dev/null
@@ -0,0 +1,42 @@
+package asm
+
+import (
+       "fmt"
+)
+
+// Register is the source or destination of most operations.
+type Register uint8
+
+// R0 contains return values.
+const R0 Register = 0
+
+// Registers for function arguments.
+const (
+	R1 Register = R0 + 1 + iota
+	R2
+	R3
+	R4
+	R5
+)
+
+// Callee saved registers preserved by function calls.
+const (
+	R6 Register = R5 + 1 + iota
+	R7
+	R8
+	R9
+)
+
+// Read-only frame pointer to access stack.
+const (
+	R10 Register = R9 + 1
+	// RFP is an alias for R10, the read-only frame pointer.
+	RFP = R10
+)
+
+func (r Register) String() string {
+       v := uint8(r)
+       if v == 10 {
+               return "rfp"
+       }
+       return fmt.Sprintf("r%d", v)
+}
diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go
new file mode 100644 (file)
index 0000000..5ad1a5e
--- /dev/null
@@ -0,0 +1,148 @@
+package ebpf
+
+import (
+	"github.com/cilium/ebpf/asm"
+	"github.com/pkg/errors"
+)
+
+// CollectionOptions control loading a collection into the kernel.
+type CollectionOptions struct {
+	// Programs is passed through to NewProgramWithOptions for every
+	// program in the collection.
+	Programs ProgramOptions
+}
+
+// CollectionSpec describes a collection.
+type CollectionSpec struct {
+	Maps     map[string]*MapSpec
+	Programs map[string]*ProgramSpec
+}
+
+// Copy returns a recursive copy of the spec.
+func (cs *CollectionSpec) Copy() *CollectionSpec {
+       if cs == nil {
+               return nil
+       }
+
+       cpy := CollectionSpec{
+               Maps:     make(map[string]*MapSpec, len(cs.Maps)),
+               Programs: make(map[string]*ProgramSpec, len(cs.Programs)),
+       }
+
+       for name, spec := range cs.Maps {
+               cpy.Maps[name] = spec.Copy()
+       }
+
+       for name, spec := range cs.Programs {
+               cpy.Programs[name] = spec.Copy()
+       }
+
+       return &cpy
+}
+
+// Collection is a collection of Programs and Maps associated
+// with their symbols
+type Collection struct {
+	Programs map[string]*Program
+	Maps     map[string]*Map
+}
+
+// NewCollection creates a Collection from a specification.
+//
+// Only maps referenced by at least one of the programs are initialized.
+// It is shorthand for NewCollectionWithOptions with default options.
+func NewCollection(spec *CollectionSpec) (*Collection, error) {
+	return NewCollectionWithOptions(spec, CollectionOptions{})
+}
+
+// NewCollectionWithOptions creates a Collection from a specification.
+//
+// Only maps referenced by at least one of the programs are initialized.
+func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) {
+       maps := make(map[string]*Map)
+       for mapName, mapSpec := range spec.Maps {
+               m, err := NewMap(mapSpec)
+               if err != nil {
+                       return nil, errors.Wrapf(err, "map %s", mapName)
+               }
+               maps[mapName] = m
+       }
+
+       progs := make(map[string]*Program)
+       for progName, origProgSpec := range spec.Programs {
+               progSpec := origProgSpec.Copy()
+
+               // Rewrite any reference to a valid map.
+               for i := range progSpec.Instructions {
+                       var (
+                               ins = &progSpec.Instructions[i]
+                               m   = maps[ins.Reference]
+                       )
+
+                       if ins.Reference == "" || m == nil {
+                               continue
+                       }
+
+                       if ins.Src == asm.R1 {
+                               // Don't overwrite maps already rewritten, users can
+                               // rewrite programs in the spec themselves
+                               continue
+                       }
+
+                       if err := ins.RewriteMapPtr(m.FD()); err != nil {
+                               return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference)
+                       }
+               }
+
+               prog, err := NewProgramWithOptions(progSpec, opts.Programs)
+               if err != nil {
+                       return nil, errors.Wrapf(err, "program %s", progName)
+               }
+               progs[progName] = prog
+       }
+
+       return &Collection{
+               progs,
+               maps,
+       }, nil
+}
+
+// LoadCollection parses an object file and converts it to a collection.
+func LoadCollection(file string) (*Collection, error) {
+       spec, err := LoadCollectionSpec(file)
+       if err != nil {
+               return nil, err
+       }
+       return NewCollection(spec)
+}
+
+// Close frees all maps and programs associated with the collection.
+//
+// The collection mustn't be used afterwards.
+func (coll *Collection) Close() {
+       for _, prog := range coll.Programs {
+               prog.Close()
+       }
+       for _, m := range coll.Maps {
+               m.Close()
+       }
+}
+
+// DetachMap removes the named map from the Collection.
+//
+// This means that a later call to Close() will not affect this map.
+//
+// Returns nil if no map of that name exists.
+func (coll *Collection) DetachMap(name string) *Map {
+       m := coll.Maps[name]
+       delete(coll.Maps, name)
+       return m
+}
+
+// DetachProgram removes the named program from the Collection.
+//
+// This means that a later call to Close() will not affect this program.
+//
+// Returns nil if no program of that name exists.
+func (coll *Collection) DetachProgram(name string) *Program {
+       p := coll.Programs[name]
+       delete(coll.Programs, name)
+       return p
+}
diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go
new file mode 100644 (file)
index 0000000..d96e6b1
--- /dev/null
@@ -0,0 +1,17 @@
+// Package ebpf is a toolkit for working with eBPF programs.
+//
+// eBPF programs are small snippets of code which are executed directly
+// in a VM in the Linux kernel, which makes them very fast and flexible.
+// Many Linux subsystems now accept eBPF programs. This makes it possible
+// to implement highly application specific logic inside the kernel,
+// without having to modify the actual kernel itself.
+//
+// This package is designed for long-running processes which
+// want to use eBPF to implement part of their application logic. It has no
+// run-time dependencies outside of the library and the Linux kernel itself.
+// eBPF code should be compiled ahead of time using clang, and shipped with
+// your application as any other resource.
+//
+// This package doesn't include code required to attach eBPF to Linux
+// subsystems, since this varies per subsystem.
+package ebpf
diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go
new file mode 100644 (file)
index 0000000..3bdc084
--- /dev/null
@@ -0,0 +1,392 @@
+package ebpf
+
+import (
+       "bytes"
+       "debug/elf"
+       "encoding/binary"
+       "fmt"
+       "io"
+       "os"
+       "strings"
+
+       "github.com/cilium/ebpf/asm"
+
+       "github.com/pkg/errors"
+)
+
+// elfCode bundles a parsed ELF file with its symbol table and a
+// per-section index of symbol names keyed by byte offset.
+type elfCode struct {
+	*elf.File
+	symbols           []elf.Symbol
+	symbolsPerSection map[elf.SectionIndex]map[uint64]string
+}
+
+// LoadCollectionSpec parses an ELF file into a CollectionSpec.
+func LoadCollectionSpec(file string) (*CollectionSpec, error) {
+	f, err := os.Open(file)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	spec, err := LoadCollectionSpecFromReader(f)
+	// Wrapf is a no-op on a nil error, so the success path is unchanged.
+	return spec, errors.Wrapf(err, "file %s", file)
+}
+
+// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
+func LoadCollectionSpecFromReader(code io.ReaderAt) (*CollectionSpec, error) {
+       f, err := elf.NewFile(code)
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+
+       symbols, err := f.Symbols()
+       if err != nil {
+               return nil, errors.Wrap(err, "load symbols")
+       }
+
+       ec := &elfCode{f, symbols, symbolsPerSection(symbols)}
+
+       var licenseSection, versionSection *elf.Section
+       progSections := make(map[elf.SectionIndex]*elf.Section)
+       relSections := make(map[elf.SectionIndex]*elf.Section)
+       mapSections := make(map[elf.SectionIndex]*elf.Section)
+       for i, sec := range ec.Sections {
+               switch {
+               case strings.HasPrefix(sec.Name, "license"):
+                       licenseSection = sec
+               case strings.HasPrefix(sec.Name, "version"):
+                       versionSection = sec
+               case strings.HasPrefix(sec.Name, "maps"):
+                       mapSections[elf.SectionIndex(i)] = sec
+               case sec.Type == elf.SHT_REL:
+                       if int(sec.Info) >= len(ec.Sections) {
+                               return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info)
+                       }
+
+                       // Store relocations under the section index of the target
+                       idx := elf.SectionIndex(sec.Info)
+                       if relSections[idx] != nil {
+                               return nil, errors.Errorf("section %d has multiple relocation sections", idx)
+                       }
+                       relSections[idx] = sec
+               case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
+                       progSections[elf.SectionIndex(i)] = sec
+               }
+       }
+
+       license, err := loadLicense(licenseSection)
+       if err != nil {
+               return nil, errors.Wrap(err, "load license")
+       }
+
+       version, err := loadVersion(versionSection, ec.ByteOrder)
+       if err != nil {
+               return nil, errors.Wrap(err, "load version")
+       }
+
+       maps, err := ec.loadMaps(mapSections)
+       if err != nil {
+               return nil, errors.Wrap(err, "load maps")
+       }
+
+       progs, libs, err := ec.loadPrograms(progSections, relSections, license, version)
+       if err != nil {
+               return nil, errors.Wrap(err, "load programs")
+       }
+
+       if len(libs) > 0 {
+               for name, prog := range progs {
+                       prog.Instructions, err = link(prog.Instructions, libs...)
+                       if err != nil {
+                               return nil, errors.Wrapf(err, "program %s", name)
+                       }
+               }
+       }
+
+       return &CollectionSpec{maps, progs}, nil
+}
+
+func loadLicense(sec *elf.Section) (string, error) {
+       if sec == nil {
+               return "", errors.Errorf("missing license section")
+       }
+       data, err := sec.Data()
+       if err != nil {
+               return "", errors.Wrapf(err, "section %s", sec.Name)
+       }
+       return string(bytes.TrimRight(data, "\000")), nil
+}
+
+// loadVersion reads the required kernel version from the given section.
+// A missing section is not an error and yields version 0. Note that
+// errors.Wrapf returns nil for a nil error, so the success path returns
+// (version, nil).
+func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) {
+	if sec == nil {
+		return 0, nil
+	}
+
+	var version uint32
+	err := binary.Read(sec.Open(), bo, &version)
+	return version, errors.Wrapf(err, "section %s", sec.Name)
+}
+
+// loadPrograms decodes each executable section into a ProgramSpec, keyed by
+// the function symbol at the start of the section. Sections whose name does
+// not map to a known program type are returned separately as "library"
+// instruction streams for later linking.
+func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, license string, version uint32) (map[string]*ProgramSpec, []asm.Instructions, error) {
+	var (
+		progs = make(map[string]*ProgramSpec)
+		libs  []asm.Instructions
+	)
+	for idx, prog := range progSections {
+		syms := ec.symbolsPerSection[idx]
+		if len(syms) == 0 {
+			return nil, nil, errors.Errorf("section %v: missing symbols", prog.Name)
+		}
+
+		// The symbol at byte offset 0 names the program.
+		funcSym := syms[0]
+		if funcSym == "" {
+			return nil, nil, errors.Errorf("section %v: no label at start", prog.Name)
+		}
+
+		rels, err := ec.loadRelocations(relSections[idx])
+		if err != nil {
+			return nil, nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym)
+		}
+
+		insns, err := ec.loadInstructions(prog, syms, rels)
+		if err != nil {
+			return nil, nil, errors.Wrapf(err, "program %s: can't unmarshal instructions", funcSym)
+		}
+
+		if progType, attachType := getProgType(prog.Name); progType == UnspecifiedProgram {
+			// There is no single name we can use for "library" sections,
+			// since they may contain multiple functions. We'll decode the
+			// labels they contain later on, and then link sections that way.
+			libs = append(libs, insns)
+		} else {
+			progs[funcSym] = &ProgramSpec{
+				Name:          funcSym,
+				Type:          progType,
+				AttachType:    attachType,
+				License:       license,
+				KernelVersion: version,
+				Instructions:  insns,
+			}
+		}
+	}
+	return progs, libs, nil
+}
+
+// loadInstructions unmarshals a section's raw bytes into asm.Instructions,
+// attaching the symbol (label) and relocation reference recorded at each
+// instruction's byte offset. Reading stops cleanly at io.EOF.
+func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, error) {
+	var (
+		r      = section.Open()
+		insns  asm.Instructions
+		ins    asm.Instruction
+		offset uint64
+	)
+	for {
+		n, err := ins.Unmarshal(r, ec.ByteOrder)
+		if err == io.EOF {
+			return insns, nil
+		}
+		if err != nil {
+			return nil, errors.Wrapf(err, "offset %d", offset)
+		}
+
+		// Both maps are keyed by byte offset within the section; a
+		// missing entry yields the empty string.
+		ins.Symbol = symbols[offset]
+		ins.Reference = relocations[offset]
+
+		insns = append(insns, ins)
+		offset += n
+	}
+}
+
+// loadMaps decodes the fixed-size map descriptors found in "maps" sections
+// into MapSpecs, keyed by the symbol naming each descriptor. Descriptors in
+// one section must all have the same size; any bytes beyond the known
+// fields must be zero.
+func (ec *elfCode) loadMaps(mapSections map[elf.SectionIndex]*elf.Section) (map[string]*MapSpec, error) {
+	var (
+		maps = make(map[string]*MapSpec)
+		b    = make([]byte, 1)
+	)
+	for idx, sec := range mapSections {
+		syms := ec.symbolsPerSection[idx]
+		if len(syms) == 0 {
+			return nil, errors.Errorf("section %v: no symbols", sec.Name)
+		}
+
+		if sec.Size%uint64(len(syms)) != 0 {
+			return nil, errors.Errorf("section %v: map descriptors are not of equal size", sec.Name)
+		}
+
+		var (
+			r    = sec.Open()
+			size = sec.Size / uint64(len(syms))
+		)
+		for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size {
+			mapSym := syms[offset]
+			if mapSym == "" {
+				// Include the symbol table in the error instead of
+				// dumping it to stdout (stray debug Println removed).
+				return nil, errors.Errorf("section %s: missing symbol for map at offset %d (symbols: %s)", sec.Name, offset, fmt.Sprint(syms))
+			}
+
+			if maps[mapSym] != nil {
+				return nil, errors.Errorf("section %v: map %v already exists", sec.Name, mapSym)
+			}
+
+			lr := io.LimitReader(r, int64(size))
+
+			// Decode the known descriptor fields in order; a short read
+			// on any of them indicates a truncated descriptor.
+			var spec MapSpec
+			switch {
+			case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil:
+				return nil, errors.Errorf("map %v: missing type", mapSym)
+			case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil:
+				return nil, errors.Errorf("map %v: missing key size", mapSym)
+			case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil:
+				return nil, errors.Errorf("map %v: missing value size", mapSym)
+			case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil:
+				return nil, errors.Errorf("map %v: missing max entries", mapSym)
+			case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil:
+				return nil, errors.Errorf("map %v: missing flags", mapSym)
+			}
+
+			// Reject descriptors with unknown non-zero trailing fields.
+			for {
+				_, err := lr.Read(b)
+				if err == io.EOF {
+					break
+				}
+				if err != nil {
+					return nil, err
+				}
+				if b[0] != 0 {
+					return nil, errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym)
+				}
+			}
+
+			maps[mapSym] = &spec
+		}
+	}
+	return maps, nil
+}
+
+// getProgType maps an ELF section name to a program type and attach type
+// by longest-known-prefix convention (mirroring libbpf's section table).
+// Unrecognized names yield (UnspecifiedProgram, AttachNone).
+func getProgType(v string) (ProgramType, AttachType) {
+	types := map[string]ProgramType{
+		// From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568
+		"socket":         SocketFilter,
+		"seccomp":        SocketFilter,
+		"kprobe/":        Kprobe,
+		"kretprobe/":     Kprobe,
+		"tracepoint/":    TracePoint,
+		"xdp":            XDP,
+		"perf_event":     PerfEvent,
+		"sockops":        SockOps,
+		"sk_skb":         SkSKB,
+		"sk_msg":         SkMsg,
+		"lirc_mode2":     LircMode2,
+		"flow_dissector": FlowDissector,
+
+		"cgroup_skb/":       CGroupSKB,
+		"cgroup/dev":        CGroupDevice,
+		"cgroup/skb":        CGroupSKB,
+		"cgroup/sock":       CGroupSock,
+		"cgroup/post_bind":  CGroupSock,
+		"cgroup/bind":       CGroupSockAddr,
+		"cgroup/connect":    CGroupSockAddr,
+		"cgroup/sendmsg":    CGroupSockAddr,
+		"cgroup/recvmsg":    CGroupSockAddr,
+		"cgroup/sysctl":     CGroupSysctl,
+		"cgroup/getsockopt": CGroupSockopt,
+		"cgroup/setsockopt": CGroupSockopt,
+		"classifier":        SchedCLS,
+		"action":            SchedACT,
+	}
+	attachTypes := map[string]AttachType{
+		"cgroup_skb/ingress":    AttachCGroupInetIngress,
+		"cgroup_skb/egress":     AttachCGroupInetEgress,
+		"cgroup/sock":           AttachCGroupInetSockCreate,
+		"cgroup/post_bind4":     AttachCGroupInet4PostBind,
+		"cgroup/post_bind6":     AttachCGroupInet6PostBind,
+		"cgroup/dev":            AttachCGroupDevice,
+		"sockops":               AttachCGroupSockOps,
+		"sk_skb/stream_parser":  AttachSkSKBStreamParser,
+		"sk_skb/stream_verdict": AttachSkSKBStreamVerdict,
+		"sk_msg":                AttachSkSKBStreamVerdict,
+		"lirc_mode2":            AttachLircMode2,
+		"flow_dissector":        AttachFlowDissector,
+		"cgroup/bind4":          AttachCGroupInet4Bind,
+		"cgroup/bind6":          AttachCGroupInet6Bind,
+		"cgroup/connect4":       AttachCGroupInet4Connect,
+		"cgroup/connect6":       AttachCGroupInet6Connect,
+		"cgroup/sendmsg4":       AttachCGroupUDP4Sendmsg,
+		"cgroup/sendmsg6":       AttachCGroupUDP6Sendmsg,
+		"cgroup/recvmsg4":       AttachCGroupUDP4Recvmsg,
+		"cgroup/recvmsg6":       AttachCGroupUDP6Recvmsg,
+		"cgroup/sysctl":         AttachCGroupSysctl,
+		"cgroup/getsockopt":     AttachCGroupGetsockopt,
+		"cgroup/setsockopt":     AttachCGroupSetsockopt,
+	}
+	// NOTE(review): map iteration order is random. The attach loop keeps the
+	// LAST matching key and the types loop returns on the FIRST match, so if
+	// any section name ever matched two keys with different values the result
+	// would be nondeterministic. The current keys appear not to collide —
+	// verify when adding new entries.
+	attachType := AttachNone
+	for k, t := range attachTypes {
+		if strings.HasPrefix(v, k) {
+			attachType = t
+		}
+	}
+
+	for k, t := range types {
+		if strings.HasPrefix(v, k) {
+			return t, attachType
+		}
+	}
+	return UnspecifiedProgram, AttachNone
+}
+
+// loadRelocations parses the Rel64 entries of a relocation section into a
+// map of instruction byte offset -> referenced symbol name. A nil section
+// yields an empty map.
+func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) {
+	rels := make(map[uint64]string)
+	if sec == nil {
+		return rels, nil
+	}
+
+	if sec.Entsize < 16 {
+		return nil, errors.New("rels are less than 16 bytes")
+	}
+
+	r := sec.Open()
+	for off := uint64(0); off < sec.Size; off += sec.Entsize {
+		ent := io.LimitReader(r, int64(sec.Entsize))
+
+		var rel elf.Rel64
+		if binary.Read(ent, ec.ByteOrder, &rel) != nil {
+			return nil, errors.Errorf("can't parse relocation at offset %v", off)
+		}
+
+		// ELF symbol indices are 1-based; 0 means "no symbol", which would
+		// underflow to -1 here. Guard both ends before indexing ec.symbols
+		// to avoid an out-of-range panic.
+		symNo := int(elf.R_SYM64(rel.Info) - 1)
+		if symNo < 0 || symNo >= len(ec.symbols) {
+			return nil, errors.Errorf("relocation at offset %d: symbol %v doesn't exist", off, symNo)
+		}
+
+		rels[rel.Off] = ec.symbols[symNo].Name
+	}
+	return rels, nil
+}
+
+func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string {
+       result := make(map[elf.SectionIndex]map[uint64]string)
+       for i, sym := range symbols {
+               switch elf.ST_TYPE(sym.Info) {
+               case elf.STT_NOTYPE:
+                       // Older versions of LLVM doesn't tag
+                       // symbols correctly.
+                       break
+               case elf.STT_OBJECT:
+                       break
+               case elf.STT_FUNC:
+                       break
+               default:
+                       continue
+               }
+
+               if sym.Name == "" {
+                       continue
+               }
+
+               idx := sym.Section
+               if _, ok := result[idx]; !ok {
+                       result[idx] = make(map[uint64]string)
+               }
+               result[idx][sym.Value] = symbols[i].Name
+       }
+       return result
+}
diff --git a/vendor/github.com/cilium/ebpf/feature.go b/vendor/github.com/cilium/ebpf/feature.go
new file mode 100644 (file)
index 0000000..9104bc9
--- /dev/null
@@ -0,0 +1,19 @@
+package ebpf
+
+import (
+       "sync"
+)
+
+// featureTest wraps a probe function so that it runs at most once, caching
+// the result for all subsequent calls.
+type featureTest struct {
+	// Fn performs the actual probe. It must be set before the first call
+	// to Result and is invoked exactly once.
+	Fn func() bool
+
+	once   sync.Once
+	result bool
+}
+
+// Result runs Fn once (guarded by sync.Once, so this is safe for concurrent
+// use) and returns the cached outcome on every subsequent call.
+func (ft *featureTest) Result() bool {
+	ft.once.Do(func() {
+		ft.result = ft.Fn()
+	})
+	return ft.result
+}
diff --git a/vendor/github.com/cilium/ebpf/go.mod b/vendor/github.com/cilium/ebpf/go.mod
new file mode 100644 (file)
index 0000000..687bdec
--- /dev/null
@@ -0,0 +1,8 @@
+module github.com/cilium/ebpf
+
+go 1.12
+
+require (
+       github.com/pkg/errors v0.8.1
+       golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7
+)
diff --git a/vendor/github.com/cilium/ebpf/internal/cpu.go b/vendor/github.com/cilium/ebpf/internal/cpu.go
new file mode 100644 (file)
index 0000000..ce3cab7
--- /dev/null
@@ -0,0 +1,64 @@
+package internal
+
+import (
+       "fmt"
+       "os"
+       "sync"
+
+       "github.com/pkg/errors"
+)
+
+// sysCPU caches the parsed contents of /sys/devices/system/cpu/possible.
+var sysCPU struct {
+	once sync.Once
+	err  error
+	num  int
+}
+
+// PossibleCPUs returns the max number of CPUs a system may possibly have
+// Logical CPU numbers must be of the form 0-n
+//
+// The first result — including any error — is cached via sync.Once for the
+// lifetime of the process.
+func PossibleCPUs() (int, error) {
+	sysCPU.once.Do(func() {
+		sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible")
+	})
+
+	return sysCPU.num, sysCPU.err
+}
+
+// onlineCPU caches the parsed contents of /sys/devices/system/cpu/online.
+var onlineCPU struct {
+	once sync.Once
+	err  error
+	num  int
+}
+
+// OnlineCPUs returns the number of currently online CPUs
+// Logical CPU numbers must be of the form 0-n
+//
+// The first result — including any error — is cached via sync.Once for the
+// lifetime of the process.
+func OnlineCPUs() (int, error) {
+	onlineCPU.once.Do(func() {
+		onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online")
+	})
+
+	return onlineCPU.num, onlineCPU.err
+}
+
+// parseCPUs parses the number of cpus from sysfs,
+// in the format of "/sys/devices/system/cpu/{possible,online,..}.
+// Logical CPU numbers must be of the form 0-n
+// parseCPUs parses the number of cpus from sysfs,
+// in the format of "/sys/devices/system/cpu/{possible,online,..}.
+// Logical CPU numbers must be of the form 0-n
+func parseCPUs(path string) (int, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return 0, err
+	}
+	defer file.Close()
+
+	// Fscanf's error is deliberately ignored: n < 1 already detects a
+	// parse failure, and "0" alone (no range) legitimately scans only one
+	// value.
+	var low, high int
+	n, _ := fmt.Fscanf(file, "%d-%d", &low, &high)
+	if n < 1 || low != 0 {
+		// Previously errors.Wrapf(err, ...) was used here with a nil err,
+		// which returns nil and silently reported success on bad input.
+		return 0, errors.Errorf("%s has unknown format", path)
+	}
+	if n == 1 {
+		high = low
+	}
+
+	// cpus is 0 indexed
+	return high + 1, nil
+}
diff --git a/vendor/github.com/cilium/ebpf/internal/endian.go b/vendor/github.com/cilium/ebpf/internal/endian.go
new file mode 100644 (file)
index 0000000..ac8a94e
--- /dev/null
@@ -0,0 +1,24 @@
+package internal
+
+import (
+       "encoding/binary"
+       "unsafe"
+)
+
+// NativeEndian is set to either binary.BigEndian or binary.LittleEndian,
+// depending on the host's endianness.
+var NativeEndian binary.ByteOrder
+
+// init probes the host byte order exactly once at package load.
+func init() {
+	if isBigEndian() {
+		NativeEndian = binary.BigEndian
+	} else {
+		NativeEndian = binary.LittleEndian
+	}
+}
+
+// isBigEndian reports whether the host stores the most significant byte
+// first, by inspecting the in-memory representation of the integer 1.
+func isBigEndian() (ret bool) {
+	i := int(0x1)
+	bs := (*[int(unsafe.Sizeof(i))]byte)(unsafe.Pointer(&i))
+	return bs[0] == 0
+}
diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go
new file mode 100644 (file)
index 0000000..49c6be5
--- /dev/null
@@ -0,0 +1,118 @@
+// +build linux
+
+package unix
+
+import (
+       "syscall"
+
+       linux "golang.org/x/sys/unix"
+)
+
+// Constants re-exported from golang.org/x/sys/unix so the rest of the
+// module can compile against this package on every platform.
+const (
+	ENOENT                   = linux.ENOENT
+	EAGAIN                   = linux.EAGAIN
+	ENOSPC                   = linux.ENOSPC
+	EINVAL                   = linux.EINVAL
+	EPOLLIN                  = linux.EPOLLIN
+	BPF_OBJ_NAME_LEN         = linux.BPF_OBJ_NAME_LEN
+	BPF_TAG_SIZE             = linux.BPF_TAG_SIZE
+	SYS_BPF                  = linux.SYS_BPF
+	F_DUPFD_CLOEXEC          = linux.F_DUPFD_CLOEXEC
+	EPOLL_CTL_ADD            = linux.EPOLL_CTL_ADD
+	EPOLL_CLOEXEC            = linux.EPOLL_CLOEXEC
+	O_CLOEXEC                = linux.O_CLOEXEC
+	O_NONBLOCK               = linux.O_NONBLOCK
+	PROT_READ                = linux.PROT_READ
+	PROT_WRITE               = linux.PROT_WRITE
+	MAP_SHARED               = linux.MAP_SHARED
+	PERF_TYPE_SOFTWARE       = linux.PERF_TYPE_SOFTWARE
+	PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT
+	PerfBitWatermark         = linux.PerfBitWatermark
+	PERF_SAMPLE_RAW          = linux.PERF_SAMPLE_RAW
+	PERF_FLAG_FD_CLOEXEC     = linux.PERF_FLAG_FD_CLOEXEC
+)
+
+// Statfs_t is a wrapper
+type Statfs_t = linux.Statfs_t
+
+// Rlimit is a wrapper
+type Rlimit = linux.Rlimit
+
+// Setrlimit is a wrapper
+func Setrlimit(resource int, rlim *Rlimit) (err error) {
+	return linux.Setrlimit(resource, rlim)
+}
+
+// Syscall is a wrapper
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
+	return linux.Syscall(trap, a1, a2, a3)
+}
+
+// FcntlInt is a wrapper
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
+	return linux.FcntlInt(fd, cmd, arg)
+}
+
+// Statfs is a wrapper
+func Statfs(path string, buf *Statfs_t) (err error) {
+	return linux.Statfs(path, buf)
+}
+
+// Close is a wrapper
+func Close(fd int) (err error) {
+	return linux.Close(fd)
+}
+
+// EpollEvent is a wrapper
+type EpollEvent = linux.EpollEvent
+
+// EpollWait is a wrapper
+func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
+	return linux.EpollWait(epfd, events, msec)
+}
+
+// EpollCtl is a wrapper
+func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
+	return linux.EpollCtl(epfd, op, fd, event)
+}
+
+// Eventfd is a wrapper
+func Eventfd(initval uint, flags int) (fd int, err error) {
+	return linux.Eventfd(initval, flags)
+}
+
+// Write is a wrapper
+func Write(fd int, p []byte) (n int, err error) {
+	return linux.Write(fd, p)
+}
+
+// EpollCreate1 is a wrapper
+func EpollCreate1(flag int) (fd int, err error) {
+	return linux.EpollCreate1(flag)
+}
+
+// PerfEventMmapPage is a wrapper.
+// NOTE(review): unlike the other types in this file this is a defined type,
+// not an alias ("="), so conversions are needed at use sites — confirm the
+// asymmetry is intentional.
+type PerfEventMmapPage linux.PerfEventMmapPage
+
+// SetNonblock is a wrapper
+func SetNonblock(fd int, nonblocking bool) (err error) {
+	return linux.SetNonblock(fd, nonblocking)
+}
+
+// Mmap is a wrapper
+func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
+	return linux.Mmap(fd, offset, length, prot, flags)
+}
+
+// Munmap is a wrapper
+func Munmap(b []byte) (err error) {
+	return linux.Munmap(b)
+}
+
+// PerfEventAttr is a wrapper
+type PerfEventAttr = linux.PerfEventAttr
+
+// PerfEventOpen is a wrapper
+func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
+	return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags)
+}
diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go
new file mode 100644 (file)
index 0000000..a327f2a
--- /dev/null
@@ -0,0 +1,183 @@
+// +build !linux
+
+package unix
+
+import (
+       "fmt"
+       "runtime"
+       "syscall"
+)
+
+var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
+
+const ( // NOTE(review): numeric values appear to mirror the linux/amd64 definitions — confirm before relying on them off-Linux
+       ENOENT                   = syscall.ENOENT
+       EAGAIN                   = syscall.EAGAIN
+       ENOSPC                   = syscall.ENOSPC
+       EINVAL                   = syscall.EINVAL
+       BPF_OBJ_NAME_LEN         = 0x10
+       BPF_TAG_SIZE             = 0x8
+       SYS_BPF                  = 321
+       F_DUPFD_CLOEXEC          = 0x406
+       EPOLLIN                  = 0x1
+       EPOLL_CTL_ADD            = 0x1
+       EPOLL_CLOEXEC            = 0x80000
+       O_CLOEXEC                = 0x80000
+       O_NONBLOCK               = 0x800
+       PROT_READ                = 0x1
+       PROT_WRITE               = 0x2
+       MAP_SHARED               = 0x1
+       PERF_TYPE_SOFTWARE       = 0x1
+       PERF_COUNT_SW_BPF_OUTPUT = 0xa
+       PerfBitWatermark         = 0x4000
+       PERF_SAMPLE_RAW          = 0x400
+       PERF_FLAG_FD_CLOEXEC     = 0x8
+)
+
+// Statfs_t is a wrapper; a stand-in struct so this package compiles on non-Linux platforms.
+type Statfs_t struct {
+       Type    int64
+       Bsize   int64
+       Blocks  uint64
+       Bfree   uint64
+       Bavail  uint64
+       Files   uint64
+       Ffree   uint64
+       Fsid    [2]int32
+       Namelen int64
+       Frsize  int64
+       Flags   int64
+       Spare   [4]int64
+}
+
+// Rlimit is a wrapper; a stand-in struct for non-Linux platforms.
+type Rlimit struct {
+       Cur uint64
+       Max uint64
+}
+
+// Setrlimit is a wrapper; it always fails with errNonLinux.
+func Setrlimit(resource int, rlim *Rlimit) (err error) {
+       return errNonLinux
+}
+
+// Syscall is a wrapper; it always fails with errno 1.
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
+       return 0, 0, syscall.Errno(1)
+}
+
+// FcntlInt is a wrapper; it always fails with errNonLinux.
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
+       return -1, errNonLinux
+}
+
+// Statfs is a wrapper; it always fails with errNonLinux.
+func Statfs(path string, buf *Statfs_t) error {
+       return errNonLinux
+}
+
+// Close is a wrapper; it always fails with errNonLinux.
+func Close(fd int) (err error) {
+       return errNonLinux
+}
+
+// EpollEvent is a wrapper; a stand-in struct for non-Linux platforms.
+type EpollEvent struct {
+       Events uint32
+       Fd     int32
+       Pad    int32
+}
+
+// EpollWait is a wrapper; it always fails with errNonLinux.
+func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
+       return 0, errNonLinux
+}
+
+// EpollCtl is a wrapper; it always fails with errNonLinux.
+func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
+       return errNonLinux
+}
+
+// Eventfd is a wrapper; it always fails with errNonLinux.
+func Eventfd(initval uint, flags int) (fd int, err error) {
+       return 0, errNonLinux
+}
+
+// Write is a wrapper; it always fails with errNonLinux.
+func Write(fd int, p []byte) (n int, err error) {
+       return 0, errNonLinux
+}
+
+// EpollCreate1 is a wrapper; it always fails with errNonLinux.
+func EpollCreate1(flag int) (fd int, err error) {
+       return 0, errNonLinux
+}
+
+// PerfEventMmapPage is a wrapper; a stand-in struct for non-Linux platforms.
+type PerfEventMmapPage struct {
+       Version        uint32
+       Compat_version uint32
+       Lock           uint32
+       Index          uint32
+       Offset         int64
+       Time_enabled   uint64
+       Time_running   uint64
+       Capabilities   uint64
+       Pmc_width      uint16
+       Time_shift     uint16
+       Time_mult      uint32
+       Time_offset    uint64
+       Time_zero      uint64
+       Size           uint32
+
+       Data_head   uint64
+       Data_tail   uint64
+       Data_offset uint64
+       Data_size   uint64
+       Aux_head    uint64
+       Aux_tail    uint64
+       Aux_offset  uint64
+       Aux_size    uint64
+}
+
+// SetNonblock is a wrapper; it always fails with errNonLinux.
+func SetNonblock(fd int, nonblocking bool) (err error) {
+       return errNonLinux
+}
+
+// Mmap is a wrapper; it always fails with errNonLinux.
+func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
+       return []byte{}, errNonLinux
+}
+
+// Munmap is a wrapper; it always fails with errNonLinux.
+func Munmap(b []byte) (err error) {
+       return errNonLinux
+}
+
+// PerfEventAttr is a wrapper; a stand-in struct for non-Linux platforms.
+type PerfEventAttr struct {
+       Type               uint32
+       Size               uint32
+       Config             uint64
+       Sample             uint64
+       Sample_type        uint64
+       Read_format        uint64
+       Bits               uint64
+       Wakeup             uint32
+       Bp_type            uint32
+       Ext1               uint64
+       Ext2               uint64
+       Branch_sample_type uint64
+       Sample_regs_user   uint64
+       Sample_stack_user  uint32
+       Clockid            int32
+       Sample_regs_intr   uint64
+       Aux_watermark      uint32
+       Sample_max_stack   uint16
+}
+
+// PerfEventOpen is a wrapper; it always fails with errNonLinux.
+func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
+       return 0, errNonLinux
+}
diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go
new file mode 100644 (file)
index 0000000..da556c2
--- /dev/null
@@ -0,0 +1,58 @@
+package ebpf
+
+import (
+       "github.com/cilium/ebpf/asm"
+)
+
+// link resolves bpf-to-bpf calls.
+//
+// Each section may contain multiple functions / labels, and is only linked
+// if the program being edited references one of these functions.
+//
+// Sections must not require linking themselves.
+func link(insns asm.Instructions, sections ...asm.Instructions) (asm.Instructions, error) {
+       for _, section := range sections {
+               var err error
+               insns, err = linkSection(insns, section)
+               if err != nil {
+                       return nil, err
+               }
+       }
+       return insns, nil
+}
+
+func linkSection(insns, section asm.Instructions) (asm.Instructions, error) {
+       // Map of symbol names defined in this section to their offsets.
+       symbols, err := section.SymbolOffsets()
+       if err != nil {
+               return nil, err
+       }
+
+       for _, ins := range insns {
+               if ins.Reference == "" {
+                       continue // instruction doesn't reference a symbol
+               }
+
+               if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 {
+                       continue // only Call instructions with Src == R1 are bpf-to-bpf pseudo calls
+               }
+
+               if ins.Constant != -1 {
+                       // This is already a valid call, no need to link again.
+                       continue
+               }
+
+               if _, ok := symbols[ins.Reference]; !ok {
+                       // Symbol isn't available in this section
+                       continue
+               }
+
+               // At this point we know that at least one function in the
+               // library is called from insns. Merge the two sections.
+               // The rewrite of ins.Constant happens in asm.Instruction.Marshal.
+               return append(insns, section...), nil
+       }
+
+       // None of the functions in the section are called. Do nothing.
+       return insns, nil
+}
diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go
new file mode 100644 (file)
index 0000000..028a913
--- /dev/null
@@ -0,0 +1,595 @@
+package ebpf
+
+import (
+       "fmt"
+       "unsafe"
+
+       "github.com/cilium/ebpf/internal"
+       "github.com/cilium/ebpf/internal/unix"
+
+       "github.com/pkg/errors"
+)
+
+// MapSpec defines a Map.
+type MapSpec struct {
+       // Name is passed to the kernel as a debug aid. Must only contain
+       // alpha numeric and '_' characters.
+       Name       string
+       Type       MapType
+       KeySize    uint32
+       ValueSize  uint32
+       MaxEntries uint32
+       Flags      uint32
+       // InnerMap is used as a template for ArrayOfMaps and HashOfMaps
+       InnerMap *MapSpec
+}
+
+// String returns a human-readable summary of the spec.
+func (ms *MapSpec) String() string {
+       return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags)
+}
+
+// Copy returns a deep copy of the spec, including InnerMap; a nil spec copies to nil.
+func (ms *MapSpec) Copy() *MapSpec {
+       if ms == nil {
+               return nil
+       }
+
+       cpy := *ms
+       cpy.InnerMap = ms.InnerMap.Copy()
+       return &cpy
+}
+
+// Map represents a Map file descriptor.
+//
+// It is not safe to close a map which is used by other goroutines.
+//
+// Methods which take interface{} arguments by default encode
+// them using binary.Read/Write in the machine's native endianness.
+//
+// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler
+// if you require custom encoding.
+type Map struct {
+       fd  *bpfFD
+       abi MapABI
+       // Per CPU maps return values larger than the size in the spec (see newMap)
+       fullValueSize int
+}
+
+// NewMapFromFD creates a map from a raw fd.
+//
+// You should not use fd after calling this function.
+func NewMapFromFD(fd int) (*Map, error) {
+       if fd < 0 {
+               return nil, errors.New("invalid fd")
+       }
+       bpfFd := newBPFFD(uint32(fd))
+
+       abi, err := newMapABIFromFd(bpfFd)
+       if err != nil {
+               bpfFd.forget()
+               return nil, err
+       }
+       return newMap(bpfFd, abi)
+}
+
+// NewMap creates a new Map.
+//
+// Creating a map for the first time will perform feature detection
+// by creating small, temporary maps.
+func NewMap(spec *MapSpec) (*Map, error) {
+       if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps {
+               return createMap(spec, nil)
+       }
+
+       if spec.InnerMap == nil {
+               return nil, errors.Errorf("%s requires InnerMap", spec.Type)
+       }
+
+       template, err := createMap(spec.InnerMap, nil)
+       if err != nil {
+               return nil, err
+       }
+       defer template.Close() // the template is only needed while creating the outer map
+
+       return createMap(spec, template.fd)
+}
+
+func createMap(spec *MapSpec, inner *bpfFD) (*Map, error) {
+       spec = spec.Copy() // work on a copy so the caller's spec isn't mutated below
+
+       switch spec.Type {
+       case ArrayOfMaps:
+               fallthrough
+       case HashOfMaps:
+               if spec.ValueSize != 0 && spec.ValueSize != 4 {
+                       return nil, errors.Errorf("ValueSize must be zero or four for map of map")
+               }
+               spec.ValueSize = 4
+
+       case PerfEventArray:
+               if spec.KeySize != 0 {
+                       return nil, errors.Errorf("KeySize must be zero for perf event array")
+               }
+               if spec.ValueSize != 0 {
+                       return nil, errors.Errorf("ValueSize must be zero for perf event array")
+               }
+               if spec.MaxEntries == 0 {
+                       n, err := internal.OnlineCPUs()
+                       if err != nil {
+                               return nil, errors.Wrap(err, "perf event array")
+                       }
+                       spec.MaxEntries = uint32(n) // default to one entry per online CPU
+               }
+
+               spec.KeySize = 4
+               spec.ValueSize = 4
+       }
+
+       attr := bpfMapCreateAttr{
+               mapType:    spec.Type,
+               keySize:    spec.KeySize,
+               valueSize:  spec.ValueSize,
+               maxEntries: spec.MaxEntries,
+               flags:      spec.Flags,
+       }
+
+       if inner != nil {
+               var err error
+               attr.innerMapFd, err = inner.value()
+               if err != nil {
+                       return nil, errors.Wrap(err, "map create")
+               }
+       }
+
+       name, err := newBPFObjName(spec.Name)
+       if err != nil {
+               return nil, errors.Wrap(err, "map create")
+       }
+
+       if haveObjName.Result() {
+               attr.mapName = name // only pass the name when the kernel supports it
+       }
+
+       fd, err := bpfMapCreate(&attr)
+       if err != nil {
+               return nil, errors.Wrap(err, "map create")
+       }
+
+       return newMap(fd, newMapABIFromSpec(spec))
+}
+
+// newMap wraps a map fd and its ABI, precomputing the full per-CPU value size.
+func newMap(fd *bpfFD, abi *MapABI) (*Map, error) {
+       m := &Map{
+               fd,
+               *abi,
+               int(abi.ValueSize),
+       }
+
+       if !abi.Type.hasPerCPUValue() {
+               return m, nil
+       }
+
+       possibleCPUs, err := internal.PossibleCPUs()
+       if err != nil {
+               return nil, err
+       }
+
+       m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs // one 8-byte-aligned value per possible CPU
+       return m, nil
+}
+// String implements fmt.Stringer.
+func (m *Map) String() string {
+       return fmt.Sprintf("%s#%d", m.abi.Type, m.fd) // NOTE(review): %d on *bpfFD formats the pointer, not the fd number — confirm intended
+}
+
+// ABI gets the ABI of the Map
+func (m *Map) ABI() MapABI {
+       return m.abi
+}
+
+// Lookup retrieves a value from a Map.
+//
+// Calls Close() on valueOut if it is of type **Map or **Program,
+// and *valueOut is not nil.
+//
+// Returns an error if the key doesn't exist, see IsNotExist.
+func (m *Map) Lookup(key, valueOut interface{}) error {
+       valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize)
+
+       if err := m.lookup(key, valuePtr); err != nil {
+               return err
+       }
+
+       if valueBytes == nil {
+               return nil // valueOut was an unsafe.Pointer; nothing to decode
+       }
+
+       if m.abi.Type.hasPerCPUValue() {
+               return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes)
+       }
+
+       switch value := valueOut.(type) {
+       case **Map:
+               m, err := unmarshalMap(valueBytes)
+               if err != nil {
+                       return err
+               }
+
+               (*value).Close()
+               *value = m
+               return nil
+       case *Map:
+               return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil))
+       case Map:
+               return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil))
+
+       case **Program:
+               p, err := unmarshalProgram(valueBytes)
+               if err != nil {
+                       return err
+               }
+
+               (*value).Close()
+               *value = p
+               return nil
+       case *Program:
+               return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil))
+       case Program:
+               return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil))
+
+       default:
+               return unmarshalBytes(valueOut, valueBytes)
+       }
+}
+
+// LookupBytes gets a value from Map.
+//
+// Returns a nil value if a key doesn't exist.
+func (m *Map) LookupBytes(key interface{}) ([]byte, error) {
+       valueBytes := make([]byte, m.fullValueSize)
+       valuePtr := newPtr(unsafe.Pointer(&valueBytes[0]))
+
+       err := m.lookup(key, valuePtr)
+       if IsNotExist(err) {
+               return nil, nil
+       }
+
+       return valueBytes, err
+}
+
+// lookup wraps bpfMapLookupElem, marshaling key to the map's key size.
+func (m *Map) lookup(key interface{}, valueOut syscallPtr) error {
+       keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
+       if err != nil {
+               return errors.WithMessage(err, "can't marshal key")
+       }
+
+       err = bpfMapLookupElem(m.fd, keyPtr, valueOut)
+       return errors.WithMessage(err, "lookup failed")
+}
+
+// MapUpdateFlags controls the behaviour of the Map.Update call.
+//
+// The exact semantics depend on the specific MapType.
+type MapUpdateFlags uint64
+
+const (
+       // UpdateAny creates a new element or updates an existing one.
+       UpdateAny MapUpdateFlags = iota
+       // UpdateNoExist creates a new element.
+       UpdateNoExist MapUpdateFlags = 1 << (iota - 1)
+       // UpdateExist updates an existing element.
+       UpdateExist
+)
+
+// Put replaces or creates a value in map.
+//
+// It is equivalent to calling Update with UpdateAny.
+func (m *Map) Put(key, value interface{}) error {
+       return m.Update(key, value, UpdateAny)
+}
+
+// Update changes the value of a key.
+func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error {
+       keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
+       if err != nil {
+               return errors.WithMessage(err, "can't marshal key")
+       }
+
+       var valuePtr syscallPtr
+       if m.abi.Type.hasPerCPUValue() {
+               valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize))
+       } else {
+               valuePtr, err = marshalPtr(value, int(m.abi.ValueSize))
+       }
+       if err != nil {
+               return errors.WithMessage(err, "can't marshal value")
+       }
+
+       return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags))
+}
+
+// Delete removes a value.
+//
+// Returns an error if the key does not exist, see IsNotExist.
+func (m *Map) Delete(key interface{}) error {
+       keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
+       if err != nil {
+               return errors.WithMessage(err, "can't marshal key")
+       }
+
+       err = bpfMapDeleteElem(m.fd, keyPtr)
+       return errors.WithMessage(err, "can't delete key")
+}
+
+// NextKey finds the key following an initial key.
+//
+// See NextKeyBytes for details.
+func (m *Map) NextKey(key, nextKeyOut interface{}) error {
+       nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize))
+
+       if err := m.nextKey(key, nextKeyPtr); err != nil {
+               return err
+       }
+
+       if nextKeyBytes == nil {
+               return nil
+       }
+
+       err := unmarshalBytes(nextKeyOut, nextKeyBytes)
+       return errors.WithMessage(err, "can't unmarshal next key")
+}
+
+// NextKeyBytes returns the key following an initial key as a byte slice.
+//
+// Passing nil will return the first key.
+//
+// Use Iterate if you want to traverse all entries in the map.
+func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) {
+       nextKey := make([]byte, m.abi.KeySize)
+       nextKeyPtr := newPtr(unsafe.Pointer(&nextKey[0]))
+
+       err := m.nextKey(key, nextKeyPtr)
+       if IsNotExist(err) {
+               return nil, nil
+       }
+
+       return nextKey, err
+}
+
+func (m *Map) nextKey(key interface{}, nextKeyOut syscallPtr) error {
+       var (
+               keyPtr syscallPtr
+               err    error
+       )
+
+       if key != nil { // a nil key requests the first key in the map
+               keyPtr, err = marshalPtr(key, int(m.abi.KeySize))
+               if err != nil {
+                       return errors.WithMessage(err, "can't marshal key")
+               }
+       }
+
+       err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut)
+       return errors.WithMessage(err, "can't get next key")
+}
+
+// Iterate traverses a map.
+//
+// It's safe to create multiple iterators at the same time.
+//
+// It's not possible to guarantee that all keys in a map will be
+// returned if there are concurrent modifications to the map.
+func (m *Map) Iterate() *MapIterator {
+       return newMapIterator(m)
+}
+
+// Close releases the Map's file descriptor; closing a nil Map is a no-op.
+func (m *Map) Close() error {
+       if m == nil {
+               // This makes it easier to clean up when iterating maps
+               // of maps / programs.
+               return nil
+       }
+
+       return m.fd.close()
+}
+
+// FD gets the file descriptor of the Map.
+//
+// Calling this function is invalid after Close has been called.
+func (m *Map) FD() int {
+       fd, err := m.fd.value()
+       if err != nil {
+               // Best effort: -1 is the number most likely to be an
+               // invalid file descriptor.
+               return -1
+       }
+
+       return int(fd)
+}
+
+// Clone creates a duplicate of the Map.
+//
+// Closing the duplicate does not affect the original, and vice versa.
+// Changes made to the map are reflected by both instances however.
+//
+// Cloning a nil Map returns nil.
+func (m *Map) Clone() (*Map, error) {
+       if m == nil {
+               return nil, nil
+       }
+
+       dup, err := m.fd.dup()
+       if err != nil {
+               return nil, errors.Wrap(err, "can't clone map")
+       }
+
+       return newMap(dup, &m.abi)
+}
+
+// Pin persists the map past the lifetime of the process that created it.
+//
+// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
+func (m *Map) Pin(fileName string) error {
+       return bpfPinObject(fileName, m.fd)
+}
+
+// LoadPinnedMap loads a Map from a BPF file.
+//
+// Requires at least Linux 4.13, and is not compatible with
+// nested maps. Use LoadPinnedMapExplicit in these situations.
+func LoadPinnedMap(fileName string) (*Map, error) {
+       fd, err := bpfGetObject(fileName)
+       if err != nil {
+               return nil, err
+       }
+       abi, err := newMapABIFromFd(fd)
+       if err != nil {
+               _ = fd.close()
+               return nil, err
+       }
+       return newMap(fd, abi)
+}
+
+// LoadPinnedMapExplicit loads a map with explicit parameters.
+func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) {
+       fd, err := bpfGetObject(fileName)
+       if err != nil {
+               return nil, err
+       }
+       return newMap(fd, abi)
+}
+
+// unmarshalMap converts a 4-byte map ID into a Map by acquiring a new fd for it.
+func unmarshalMap(buf []byte) (*Map, error) {
+       if len(buf) != 4 {
+               return nil, errors.New("map id requires 4 byte value")
+       }
+
+       // Looking up an entry in a nested map or prog array returns an id,
+       // not an fd.
+       id := internal.NativeEndian.Uint32(buf)
+       fd, err := bpfGetMapFDByID(id)
+       if err != nil {
+               return nil, err
+       }
+
+       abi, err := newMapABIFromFd(fd)
+       if err != nil {
+               _ = fd.close()
+               return nil, err
+       }
+
+       return newMap(fd, abi)
+}
+
+// MarshalBinary implements BinaryMarshaler.
+func (m *Map) MarshalBinary() ([]byte, error) {
+       fd, err := m.fd.value()
+       if err != nil {
+               return nil, err
+       }
+
+       buf := make([]byte, 4)
+       internal.NativeEndian.PutUint32(buf, fd)
+       return buf, nil
+}
+
+// MapIterator iterates a Map.
+//
+// See Map.Iterate.
+type MapIterator struct {
+       target            *Map
+       prevKey           interface{}
+       prevBytes         []byte
+       count, maxEntries uint32
+       done              bool
+       err               error
+}
+
+// newMapIterator creates an iterator over target, bounded by its MaxEntries.
+func newMapIterator(target *Map) *MapIterator {
+       return &MapIterator{
+               target:     target,
+               maxEntries: target.abi.MaxEntries,
+               prevBytes:  make([]byte, int(target.abi.KeySize)),
+       }
+}
+
+var errIterationAborted = errors.New("iteration aborted")
+
+// Next decodes the next key and value.
+//
+// Iterating a hash map from which keys are being deleted is not
+// safe. You may see the same key multiple times. Iteration may
+// also abort with an error, see IsIterationAborted.
+//
+// Returns false if there are no more entries. You must check
+// the result of Err afterwards.
+//
+// See Map.Lookup for further caveats around valueOut.
+func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool {
+       if mi.err != nil || mi.done {
+               return false
+       }
+
+       for ; mi.count < mi.maxEntries; mi.count++ {
+               var nextBytes []byte
+               nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey)
+               if mi.err != nil {
+                       return false
+               }
+
+               if nextBytes == nil {
+                       mi.done = true
+                       return false
+               }
+
+               // The user can get access to nextBytes since unmarshalBytes
+               // does not copy when unmarshaling into a []byte.
+               // Make a copy to prevent accidental corruption of
+               // iterator state.
+               copy(mi.prevBytes, nextBytes)
+               mi.prevKey = mi.prevBytes
+
+               mi.err = mi.target.Lookup(nextBytes, valueOut)
+               if IsNotExist(mi.err) {
+                       // Even though the key should be valid, we couldn't look up
+                       // its value. If we're iterating a hash map this is probably
+                       // because a concurrent delete removed the value before we
+                       // could get it. This means that the next call to NextKeyBytes
+                       // is very likely to restart iteration.
+                       // If we're iterating one of the fd maps like
+                       // ProgramArray it means that a given slot doesn't have
+                       // a valid fd associated. It's OK to continue to the next slot.
+                       continue
+               }
+               if mi.err != nil {
+                       return false
+               }
+
+               mi.err = unmarshalBytes(keyOut, nextBytes)
+               return mi.err == nil
+       }
+
+       mi.err = errIterationAborted
+       return false
+}
+
+// Err returns any encountered error.
+//
+// The method must be called after Next returns false.
+func (mi *MapIterator) Err() error {
+       return mi.err
+}
+
+// IsNotExist returns true if the error indicates that a
+// key doesn't exist.
+func IsNotExist(err error) bool {
+       return errors.Cause(err) == unix.ENOENT
+}
+
+// IsIterationAborted returns true if the iteration was aborted.
+//
+// This occurs when keys are deleted from a hash map during iteration.
+func IsIterationAborted(err error) bool {
+       return errors.Cause(err) == errIterationAborted
+}
diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go
new file mode 100644 (file)
index 0000000..44ba273
--- /dev/null
@@ -0,0 +1,192 @@
+package ebpf
+
+import (
+       "bytes"
+       "encoding"
+       "encoding/binary"
+       "reflect"
+       "runtime"
+       "unsafe"
+
+       "github.com/cilium/ebpf/internal"
+
+       "github.com/pkg/errors"
+)
+// marshalPtr marshals data to length bytes and returns a syscallPtr to the result; an unsafe.Pointer passes through unchanged.
+func marshalPtr(data interface{}, length int) (syscallPtr, error) {
+       if ptr, ok := data.(unsafe.Pointer); ok {
+               return newPtr(ptr), nil
+       }
+
+       buf, err := marshalBytes(data, length)
+       if err != nil {
+               return syscallPtr{}, err
+       }
+
+       return newPtr(unsafe.Pointer(&buf[0])), nil
+}
+
+// marshalBytes encodes data into exactly length bytes, or fails.
+func marshalBytes(data interface{}, length int) (buf []byte, err error) {
+       switch value := data.(type) {
+       case encoding.BinaryMarshaler:
+               buf, err = value.MarshalBinary()
+       case string:
+               buf = []byte(value)
+       case []byte:
+               buf = value
+       case unsafe.Pointer:
+               err = errors.New("can't marshal from unsafe.Pointer")
+       default:
+               var wr bytes.Buffer
+               err = binary.Write(&wr, internal.NativeEndian, value)
+               err = errors.Wrapf(err, "encoding %T", value) // Wrapf returns nil when err is nil
+               buf = wr.Bytes()
+       }
+       if err != nil {
+               return nil, err
+       }
+
+       if len(buf) != length {
+               return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length)
+       }
+       return buf, nil
+}
+
+// makeBuffer returns a syscallPtr to a fresh buffer of the given length; an unsafe.Pointer dst passes through with a nil buffer.
+func makeBuffer(dst interface{}, length int) (syscallPtr, []byte) {
+       if ptr, ok := dst.(unsafe.Pointer); ok {
+               return newPtr(ptr), nil
+       }
+
+       buf := make([]byte, length)
+       return newPtr(unsafe.Pointer(&buf[0])), buf
+}
+
+// unmarshalBytes decodes buf into data; see the cases below for supported targets.
+func unmarshalBytes(data interface{}, buf []byte) error {
+       switch value := data.(type) {
+       case unsafe.Pointer:
+               sh := &reflect.SliceHeader{
+                       Data: uintptr(value),
+                       Len:  len(buf),
+                       Cap:  len(buf),
+               }
+
+               dst := *(*[]byte)(unsafe.Pointer(sh))
+               copy(dst, buf)
+               runtime.KeepAlive(value)
+               return nil
+       case encoding.BinaryUnmarshaler:
+               return value.UnmarshalBinary(buf)
+       case *string:
+               *value = string(buf)
+               return nil
+       case *[]byte:
+               *value = buf // aliases buf; no copy is made
+               return nil
+       case string:
+               return errors.New("require pointer to string")
+       case []byte:
+               return errors.New("require pointer to []byte")
+       default:
+               rd := bytes.NewReader(buf)
+               err := binary.Read(rd, internal.NativeEndian, value)
+               return errors.Wrapf(err, "decoding %T", value)
+       }
+}
+
+// marshalPerCPUValue encodes a slice containing one value per
+// possible CPU into a buffer of bytes.
+//
+// Values are initialized to zero if the slice has less elements than CPUs.
+//
+// slice must have a type like []elementType.
+func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) {
+       sliceType := reflect.TypeOf(slice)
+       if sliceType.Kind() != reflect.Slice {
+               return syscallPtr{}, errors.New("per-CPU value requires slice")
+       }
+
+       possibleCPUs, err := internal.PossibleCPUs()
+       if err != nil {
+               return syscallPtr{}, err
+       }
+
+       sliceValue := reflect.ValueOf(slice)
+       sliceLen := sliceValue.Len()
+       if sliceLen > possibleCPUs {
+               return syscallPtr{}, errors.Errorf("per-CPU value exceeds number of CPUs")
+       }
+
+       alignedElemLength := align(elemLength, 8) // each per-CPU element is padded to 8 bytes
+       buf := make([]byte, alignedElemLength*possibleCPUs)
+
+       for i := 0; i < sliceLen; i++ {
+               elem := sliceValue.Index(i).Interface()
+               elemBytes, err := marshalBytes(elem, elemLength)
+               if err != nil {
+                       return syscallPtr{}, err
+               }
+
+               offset := i * alignedElemLength
+               copy(buf[offset:offset+elemLength], elemBytes)
+       }
+
+       return newPtr(unsafe.Pointer(&buf[0])), nil
+}
+
+// unmarshalPerCPUValue decodes a buffer into a slice containing one value per
+// possible CPU.
+//
+// valueOut must have a type like *[]elementType
+func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error {
+       slicePtrType := reflect.TypeOf(slicePtr)
+       if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice {
+               return errors.Errorf("per-cpu value requires pointer to slice")
+       }
+
+       possibleCPUs, err := internal.PossibleCPUs()
+       if err != nil {
+               return err
+       }
+
+       sliceType := slicePtrType.Elem()
+       slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs)
+
+       sliceElemType := sliceType.Elem()
+       sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr
+       if sliceElemIsPointer {
+               sliceElemType = sliceElemType.Elem()
+       }
+
+       step := len(buf) / possibleCPUs
+       if step < elemLength {
+               return errors.Errorf("per-cpu element length is larger than available data")
+       }
+       for i := 0; i < possibleCPUs; i++ {
+               var elem interface{}
+               if sliceElemIsPointer {
+                       newElem := reflect.New(sliceElemType)
+                       slice.Index(i).Set(newElem)
+                       elem = newElem.Interface()
+               } else {
+                       elem = slice.Index(i).Addr().Interface()
+               }
+
+               // Make a copy, since unmarshalBytes can hold on to elemBytes
+               elemBytes := make([]byte, elemLength)
+               copy(elemBytes, buf[:elemLength])
+
+               err := unmarshalBytes(elem, elemBytes)
+               if err != nil {
+                       return errors.Wrapf(err, "cpu %d", i)
+               }
+
+               buf = buf[step:]
+       }
+
+       reflect.ValueOf(slicePtr).Elem().Set(slice)
+       return nil
+}
+
+// align rounds n up to the next multiple of alignment.
+func align(n, alignment int) int {
+       return (int(n) + alignment - 1) / alignment * alignment // NOTE(review): int(n) is redundant, n is already int
+}
diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go
new file mode 100644 (file)
index 0000000..03b24fb
--- /dev/null
@@ -0,0 +1,523 @@
+package ebpf
+
+import (
+       "bytes"
+       "fmt"
+       "math"
+       "path/filepath"
+       "strings"
+       "time"
+       "unsafe"
+
+       "github.com/cilium/ebpf/asm"
+       "github.com/cilium/ebpf/internal"
+       "github.com/cilium/ebpf/internal/unix"
+
+       "github.com/pkg/errors"
+)
+
var (
	// errNotSupported is reported via IsNotSupported when the kernel
	// lacks a required feature (e.g. BPF_PROG_TEST_RUN).
	errNotSupported = errors.New("ebpf: not supported by kernel")
)

const (
	// Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN.
	// This is currently the maximum of spare space allocated for SKB
	// and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN.
	outputPad = 256 + 2
)

// DefaultVerifierLogSize is the default number of bytes allocated for the
// verifier log.
const DefaultVerifierLogSize = 64 * 1024

// ProgramOptions control loading a program into the kernel.
type ProgramOptions struct {
	// Controls the detail emitted by the kernel verifier. Set to non-zero
	// to enable logging.
	LogLevel uint32
	// Controls the output buffer size for the verifier. Defaults to
	// DefaultVerifierLogSize.
	LogSize int
}

// ProgramSpec defines a Program
type ProgramSpec struct {
	// Name is passed to the kernel as a debug aid. Must only contain
	// alpha numeric and '_' characters.
	Name string
	// Type is the program type; AttachType is passed to the kernel as
	// the expected attach type.
	Type       ProgramType
	AttachType AttachType
	// Instructions holds the program's eBPF bytecode.
	Instructions asm.Instructions
	// License of the program, e.g. "GPL" or "MIT".
	License string
	// NOTE(review): KernelVersion is not consumed by the visible loading
	// path (convertProgramSpec never reads it) — confirm upstream intent.
	KernelVersion uint32
}
+
+// Copy returns a copy of the spec.
+func (ps *ProgramSpec) Copy() *ProgramSpec {
+       if ps == nil {
+               return nil
+       }
+
+       cpy := *ps
+       cpy.Instructions = make(asm.Instructions, len(ps.Instructions))
+       copy(cpy.Instructions, ps.Instructions)
+       return &cpy
+}
+
// Program represents BPF program loaded into the kernel.
//
// It is not safe to close a Program which is used by other goroutines.
type Program struct {
	// Contains the output of the kernel verifier if enabled,
	// otherwise it is empty.
	VerifierLog string

	fd   *bpfFD
	name string
	abi  ProgramABI
}

// NewProgram creates a new Program.
//
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
func NewProgram(spec *ProgramSpec) (*Program, error) {
	return NewProgramWithOptions(spec, ProgramOptions{})
}

// NewProgramWithOptions creates a new Program.
//
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
//
// If the load fails and verifier logging was disabled, the load is
// retried once with logging enabled so the returned error carries the
// verifier's diagnostics.
func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
	// Only ask the kernel to record a program name if it supports them.
	attr, err := convertProgramSpec(spec, haveObjName.Result())
	if err != nil {
		return nil, err
	}

	logSize := DefaultVerifierLogSize
	if opts.LogSize > 0 {
		logSize = opts.LogSize
	}

	var logBuf []byte
	if opts.LogLevel > 0 {
		logBuf = make([]byte, logSize)
		attr.logLevel = opts.LogLevel
		attr.logSize = uint32(len(logBuf))
		attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0]))
	}

	fd, err := bpfProgLoad(attr)
	if err == nil {
		prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type})
		prog.VerifierLog = convertCString(logBuf)
		return prog, nil
	}

	// ENOSPC indicates the verifier log was larger than the buffer.
	truncated := errors.Cause(err) == unix.ENOSPC
	if opts.LogLevel == 0 {
		// Re-run with the verifier enabled to get better error messages.
		logBuf = make([]byte, logSize)
		attr.logLevel = 1
		attr.logSize = uint32(len(logBuf))
		attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0]))

		_, nerr := bpfProgLoad(attr)
		truncated = errors.Cause(nerr) == unix.ENOSPC
	}

	logs := convertCString(logBuf)
	if truncated {
		logs += "\n(truncated...)"
	}

	// Return the original load error together with the captured log.
	return nil, &loadError{err, logs}
}
+
// NewProgramFromFD creates a program from a raw fd.
//
// You should not use fd after calling this function.
func NewProgramFromFD(fd int) (*Program, error) {
	if fd < 0 {
		return nil, errors.New("invalid fd")
	}
	bpfFd := newBPFFD(uint32(fd))

	info, err := bpfGetProgInfoByFD(bpfFd)
	if err != nil {
		// Drop our finalizer without closing; the raw fd stays as-is
		// on error.
		bpfFd.forget()
		return nil, err
	}

	// Prefer the program's name; fall back to the kernel-computed tag
	// when the name is empty.
	var name string
	if bpfName := convertCString(info.name[:]); bpfName != "" {
		name = bpfName
	} else {
		name = convertCString(info.tag[:])
	}

	return newProgram(bpfFd, name, newProgramABIFromInfo(info)), nil
}

// newProgram wraps an existing fd, name and ABI in a Program.
func newProgram(fd *bpfFD, name string, abi *ProgramABI) *Program {
	return &Program{
		name: name,
		fd:   fd,
		abi:  *abi,
	}
}
+
+func convertProgramSpec(spec *ProgramSpec, includeName bool) (*bpfProgLoadAttr, error) {
+       if len(spec.Instructions) == 0 {
+               return nil, errors.New("Instructions cannot be empty")
+       }
+
+       if len(spec.License) == 0 {
+               return nil, errors.New("License cannot be empty")
+       }
+
+       buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
+       err := spec.Instructions.Marshal(buf, internal.NativeEndian)
+       if err != nil {
+               return nil, err
+       }
+
+       bytecode := buf.Bytes()
+       insCount := uint32(len(bytecode) / asm.InstructionSize)
+       lic := []byte(spec.License)
+       attr := &bpfProgLoadAttr{
+               progType:           spec.Type,
+               expectedAttachType: spec.AttachType,
+               insCount:           insCount,
+               instructions:       newPtr(unsafe.Pointer(&bytecode[0])),
+               license:            newPtr(unsafe.Pointer(&lic[0])),
+       }
+
+       name, err := newBPFObjName(spec.Name)
+       if err != nil {
+               return nil, err
+       }
+
+       if includeName {
+               attr.progName = name
+       }
+
+       return attr, nil
+}
+
// String returns a human-readable representation of the program,
// including its type, name (if set) and file descriptor.
func (p *Program) String() string {
	if p.name != "" {
		return fmt.Sprintf("%s(%s)#%s", p.abi.Type, p.name, p.fd)
	}
	return fmt.Sprintf("%s#%s", p.abi.Type, p.fd)
}

// ABI gets the ABI of the Program
func (p *Program) ABI() ProgramABI {
	return p.abi
}

// FD gets the file descriptor of the Program.
//
// It is invalid to call this function after Close has been called.
func (p *Program) FD() int {
	fd, err := p.fd.value()
	if err != nil {
		// Best effort: -1 is the number most likely to be an
		// invalid file descriptor.
		return -1
	}

	return int(fd)
}
+
// Clone creates a duplicate of the Program.
//
// Closing the duplicate does not affect the original, and vice versa.
//
// Cloning a nil Program returns nil.
func (p *Program) Clone() (*Program, error) {
	if p == nil {
		return nil, nil
	}

	// The clone holds its own dup'ed fd, so the two lifetimes are
	// independent.
	dup, err := p.fd.dup()
	if err != nil {
		return nil, errors.Wrap(err, "can't clone program")
	}

	return newProgram(dup, p.name, &p.abi), nil
}

// Pin persists the Program past the lifetime of the process that created it
//
// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
func (p *Program) Pin(fileName string) error {
	return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program")
}

// Close unloads the program from the kernel.
//
// Closing a nil Program is a no-op.
func (p *Program) Close() error {
	if p == nil {
		return nil
	}

	return p.fd.close()
}

// Test runs the Program in the kernel with the given input and returns the
// value returned by the eBPF program. outLen may be zero.
//
// Note: the kernel expects at least 14 bytes input for an ethernet header for
// XDP and SKB programs.
//
// This function requires at least Linux 4.12.
func (p *Program) Test(in []byte) (uint32, []byte, error) {
	ret, out, _, err := p.testRun(in, 1)
	return ret, out, err
}

// Benchmark runs the Program with the given input for a number of times
// and returns the time taken per iteration.
//
// The returned value is the return value of the last execution of
// the program.
//
// This function requires at least Linux 4.12.
func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) {
	ret, _, total, err := p.testRun(in, repeat)
	return ret, total, err
}
+
// noProgTestRun probes whether the kernel lacks BPF_PROG_TEST_RUN: it
// loads a trivial socket filter and issues a test run against it. An
// EINVAL response means the command is not recognised by this kernel.
var noProgTestRun = featureTest{
	Fn: func() bool {
		prog, err := NewProgram(&ProgramSpec{
			Type: SocketFilter,
			Instructions: asm.Instructions{
				// Equivalent of "return 0".
				asm.LoadImm(asm.R0, 0, asm.DWord),
				asm.Return(),
			},
			License: "MIT",
		})
		if err != nil {
			// This may be because we lack sufficient permissions, etc.
			return false
		}
		defer prog.Close()

		fd, err := prog.fd.value()
		if err != nil {
			return false
		}

		// Programs require at least 14 bytes input
		in := make([]byte, 14)
		attr := bpfProgTestRunAttr{
			fd:         fd,
			dataSizeIn: uint32(len(in)),
			dataIn:     newPtr(unsafe.Pointer(&in[0])),
		}

		_, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
		return errors.Cause(err) == unix.EINVAL
	},
}
+
// testRun executes the program repeat times against input in, returning
// the program's return value, the (possibly modified) output buffer and
// the total duration reported by the kernel.
func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) {
	// repeat is passed to the kernel as a uint32.
	if uint(repeat) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("repeat is too high")
	}

	if len(in) == 0 {
		return 0, nil, 0, fmt.Errorf("missing input")
	}

	if uint(len(in)) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("input is too long")
	}

	if noProgTestRun.Result() {
		return 0, nil, 0, errNotSupported
	}

	// Older kernels ignore the dataSizeOut argument when copying to user space.
	// Combined with things like bpf_xdp_adjust_head() we don't really know what the final
	// size will be. Hence we allocate an output buffer which we hope will always be large
	// enough, and panic if the kernel wrote past the end of the allocation.
	// See https://patchwork.ozlabs.org/cover/1006822/
	out := make([]byte, len(in)+outputPad)

	fd, err := p.fd.value()
	if err != nil {
		return 0, nil, 0, err
	}

	attr := bpfProgTestRunAttr{
		fd:          fd,
		dataSizeIn:  uint32(len(in)),
		dataSizeOut: uint32(len(out)),
		dataIn:      newPtr(unsafe.Pointer(&in[0])),
		dataOut:     newPtr(unsafe.Pointer(&out[0])),
		repeat:      uint32(repeat),
	}

	_, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err != nil {
		return 0, nil, 0, errors.Wrap(err, "can't run test")
	}

	if int(attr.dataSizeOut) > cap(out) {
		// Houston, we have a problem. The program created more data than we allocated,
		// and the kernel wrote past the end of our buffer.
		panic("kernel wrote past end of output buffer")
	}
	// Trim to the size the kernel actually wrote.
	out = out[:int(attr.dataSizeOut)]

	// attr.duration is in nanoseconds.
	total := time.Duration(attr.duration) * time.Nanosecond
	return attr.retval, out, total, nil
}
+
// unmarshalProgram converts a 4-byte program id (as returned when looking
// up a nested map or prog array) into a Program.
func unmarshalProgram(buf []byte) (*Program, error) {
	if len(buf) != 4 {
		return nil, errors.New("program id requires 4 byte value")
	}

	// Looking up an entry in a nested map or prog array returns an id,
	// not an fd.
	id := internal.NativeEndian.Uint32(buf)
	fd, err := bpfGetProgramFDByID(id)
	if err != nil {
		return nil, err
	}

	abi, err := newProgramABIFromFd(fd)
	if err != nil {
		// Don't leak the freshly opened fd on failure.
		_ = fd.close()
		return nil, err
	}

	return newProgram(fd, "", abi), nil
}

// MarshalBinary implements BinaryMarshaler by encoding the program's fd
// as 4 native-endian bytes.
func (p *Program) MarshalBinary() ([]byte, error) {
	value, err := p.fd.value()
	if err != nil {
		return nil, err
	}

	buf := make([]byte, 4)
	internal.NativeEndian.PutUint32(buf, value)
	return buf, nil
}
+
+// Attach a Program to a container object fd
+func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error {
+       if fd < 0 {
+               return errors.New("invalid fd")
+       }
+
+       pfd, err := p.fd.value()
+       if err != nil {
+               return err
+       }
+
+       attr := bpfProgAlterAttr{
+               targetFd:    uint32(fd),
+               attachBpfFd: pfd,
+               attachType:  uint32(typ),
+               attachFlags: uint32(flags),
+       }
+
+       return bpfProgAlter(_ProgAttach, &attr)
+}
+
+// Detach a Program from a container object fd
+func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
+       if fd < 0 {
+               return errors.New("invalid fd")
+       }
+
+       pfd, err := p.fd.value()
+       if err != nil {
+               return err
+       }
+
+       attr := bpfProgAlterAttr{
+               targetFd:    uint32(fd),
+               attachBpfFd: pfd,
+               attachType:  uint32(typ),
+               attachFlags: uint32(flags),
+       }
+
+       return bpfProgAlter(_ProgDetach, &attr)
+}
+
// LoadPinnedProgram loads a Program from a BPF file.
//
// Requires at least Linux 4.13, use LoadPinnedProgramExplicit on
// earlier versions.
func LoadPinnedProgram(fileName string) (*Program, error) {
	fd, err := bpfGetObject(fileName)
	if err != nil {
		return nil, err
	}

	// Query the ABI from the kernel (needs OBJ_GET_INFO_BY_FD, 4.13+).
	abi, err := newProgramABIFromFd(fd)
	if err != nil {
		// Don't leak the freshly opened fd on failure.
		_ = fd.close()
		return nil, err
	}

	return newProgram(fd, filepath.Base(fileName), abi), nil
}

// LoadPinnedProgramExplicit loads a program with explicit parameters,
// using the caller-supplied ABI instead of querying the kernel.
func LoadPinnedProgramExplicit(fileName string, abi *ProgramABI) (*Program, error) {
	fd, err := bpfGetObject(fileName)
	if err != nil {
		return nil, err
	}

	return newProgram(fd, filepath.Base(fileName), abi), nil
}
+
+// SanitizeName replaces all invalid characters in name.
+//
+// Use this to automatically generate valid names for maps and
+// programs at run time.
+//
+// Passing a negative value for replacement will delete characters
+// instead of replacing them.
+func SanitizeName(name string, replacement rune) string {
+       return strings.Map(func(char rune) rune {
+               if invalidBPFObjNameChar(char) {
+                       return replacement
+               }
+               return char
+       }, name)
+}
+
// loadError is returned when the kernel rejects a program; it carries
// the captured verifier log alongside the original error.
type loadError struct {
	cause       error
	verifierLog string
}

// Error appends the verifier log to the underlying cause, when one was
// captured.
func (le *loadError) Error() string {
	if le.verifierLog == "" {
		return fmt.Sprintf("failed to load program: %s", le.cause)
	}
	return fmt.Sprintf("failed to load program: %s: %s", le.cause, le.verifierLog)
}

// Cause returns the underlying error, for use with errors.Cause.
func (le *loadError) Cause() error {
	return le.cause
}

// IsNotSupported returns true if an error occurred because
// the kernel does not have support for a specific feature.
func IsNotSupported(err error) bool {
	return errors.Cause(err) == errNotSupported
}
diff --git a/vendor/github.com/cilium/ebpf/ptr_32_be.go b/vendor/github.com/cilium/ebpf/ptr_32_be.go
new file mode 100644 (file)
index 0000000..7757744
--- /dev/null
@@ -0,0 +1,14 @@
// +build armbe mips mips64p32

package ebpf

import (
	"unsafe"
)

// syscallPtr wraps an unsafe.Pointer, padded to 64 bits to
// conform to the syscall specification. On these big-endian 32-bit
// architectures the padding precedes the pointer.
type syscallPtr struct {
	pad uint32
	ptr unsafe.Pointer
}
diff --git a/vendor/github.com/cilium/ebpf/ptr_32_le.go b/vendor/github.com/cilium/ebpf/ptr_32_le.go
new file mode 100644 (file)
index 0000000..14b805e
--- /dev/null
@@ -0,0 +1,14 @@
// +build 386 amd64p32 arm mipsle mips64p32le

package ebpf

import (
	"unsafe"
)

// syscallPtr wraps an unsafe.Pointer, padded to 64 bits to
// conform to the syscall specification. On these little-endian 32-bit
// architectures the padding follows the pointer.
type syscallPtr struct {
	ptr unsafe.Pointer
	pad uint32
}
diff --git a/vendor/github.com/cilium/ebpf/ptr_64.go b/vendor/github.com/cilium/ebpf/ptr_64.go
new file mode 100644 (file)
index 0000000..c897d72
--- /dev/null
@@ -0,0 +1,14 @@
// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le
// +build !armbe,!mips,!mips64p32

package ebpf

import (
	"unsafe"
)

// syscallPtr wraps an unsafe.Pointer to conform to the syscall
// specification. On 64-bit architectures no padding is needed.
type syscallPtr struct {
	ptr unsafe.Pointer
}
diff --git a/vendor/github.com/cilium/ebpf/readme.md b/vendor/github.com/cilium/ebpf/readme.md
new file mode 100644 (file)
index 0000000..26ab2b9
--- /dev/null
@@ -0,0 +1,20 @@
+eBPF
+-------
+[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf)
+
+eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long-running processes.
+
+[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler.
+
+The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
+
+## Current status
+
+The package is production ready, but **the API is explicitly unstable
+right now**. Expect to update your code if you want to follow along.
+
+## Useful resources
+
+* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended)
+* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt)
+* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)
diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go
new file mode 100644 (file)
index 0000000..68abd3b
--- /dev/null
@@ -0,0 +1,420 @@
+package ebpf
+
+import (
+       "bytes"
+       "path/filepath"
+       "runtime"
+       "strconv"
+       "strings"
+       "unsafe"
+
+       "github.com/cilium/ebpf/internal/unix"
+
+       "github.com/pkg/errors"
+)
+
// errClosedFd is returned when operating on a bpfFD that has been closed.
var errClosedFd = errors.New("use of closed file descriptor")

// bpfFD wraps a kernel file descriptor. raw is set to -1 once closed;
// int64 keeps the closed marker distinguishable from any valid uint32 fd.
type bpfFD struct {
	raw int64
}

// newBPFFD takes ownership of value and registers a finalizer so the
// descriptor is closed when the wrapper is garbage collected.
func newBPFFD(value uint32) *bpfFD {
	fd := &bpfFD{int64(value)}
	runtime.SetFinalizer(fd, (*bpfFD).close)
	return fd
}

// String formats the raw fd number, for debug output.
func (fd *bpfFD) String() string {
	return strconv.FormatInt(fd.raw, 10)
}

// value returns the raw descriptor, or errClosedFd after close.
func (fd *bpfFD) value() (uint32, error) {
	if fd.raw < 0 {
		return 0, errClosedFd
	}

	return uint32(fd.raw), nil
}

// close marks the wrapper closed, removes the finalizer and closes the
// underlying descriptor. Closing twice is a no-op.
func (fd *bpfFD) close() error {
	if fd.raw < 0 {
		return nil
	}

	value := int(fd.raw)
	fd.raw = -1

	fd.forget()
	return unix.Close(value)
}

// forget removes the finalizer without closing the descriptor.
func (fd *bpfFD) forget() {
	runtime.SetFinalizer(fd, nil)
}

// dup duplicates the descriptor with close-on-exec set, returning a new
// independently-owned wrapper.
func (fd *bpfFD) dup() (*bpfFD, error) {
	if fd.raw < 0 {
		return nil, errClosedFd
	}

	dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0)
	if err != nil {
		return nil, errors.Wrap(err, "can't dup fd")
	}

	return newBPFFD(uint32(dup)), nil
}
+
// bpfObjName is a null-terminated string made up of
// 'A-Za-z0-9_' characters.
type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte

// newBPFObjName rejects names containing invalid characters. Copying
// stops at BPF_OBJ_NAME_LEN-1, which both truncates over-long names and
// guarantees a trailing NUL byte.
func newBPFObjName(name string) (bpfObjName, error) {
	idx := strings.IndexFunc(name, invalidBPFObjNameChar)
	if idx != -1 {
		return bpfObjName{}, errors.Errorf("invalid character '%c' in name '%s'", name[idx], name)
	}

	var result bpfObjName
	copy(result[:unix.BPF_OBJ_NAME_LEN-1], name)
	return result, nil
}
+
// invalidBPFObjNameChar reports whether char may not appear in a BPF
// object name. Valid characters are A-Z, a-z, 0-9 and '_'.
func invalidBPFObjNameChar(char rune) bool {
	valid := (char >= 'A' && char <= 'Z') ||
		(char >= 'a' && char <= 'z') ||
		(char >= '0' && char <= '9') ||
		char == '_'
	return !valid
}
+
// bpfMapCreateAttr mirrors the BPF_MAP_CREATE section of union bpf_attr.
type bpfMapCreateAttr struct {
	mapType    MapType
	keySize    uint32
	valueSize  uint32
	maxEntries uint32
	flags      uint32
	innerMapFd uint32     // since 4.12 56f668dfe00d
	numaNode   uint32     // since 4.14 96eabe7a40aa
	mapName    bpfObjName // since 4.15 ad5b177bd73f
}

// bpfMapOpAttr is used for the map lookup, update, delete and
// get-next-key commands.
type bpfMapOpAttr struct {
	mapFd   uint32
	padding uint32
	key     syscallPtr
	value   syscallPtr
	flags   uint64
}

// bpfMapInfo is filled in by BPF_OBJ_GET_INFO_BY_FD for maps.
type bpfMapInfo struct {
	mapType    uint32
	id         uint32
	keySize    uint32
	valueSize  uint32
	maxEntries uint32
	flags      uint32
	mapName    bpfObjName // since 4.15 ad5b177bd73f
}

// bpfPinObjAttr is used for BPF_OBJ_PIN and BPF_OBJ_GET.
type bpfPinObjAttr struct {
	fileName syscallPtr
	fd       uint32
	padding  uint32
}

// bpfProgLoadAttr mirrors the BPF_PROG_LOAD section of union bpf_attr.
type bpfProgLoadAttr struct {
	progType           ProgramType
	insCount           uint32
	instructions       syscallPtr
	license            syscallPtr
	logLevel           uint32
	logSize            uint32
	logBuf             syscallPtr
	kernelVersion      uint32     // since 4.1  2541517c32be
	progFlags          uint32     // since 4.11 e07b98d9bffe
	progName           bpfObjName // since 4.15 067cae47771c
	progIfIndex        uint32     // since 4.15 1f6f4cb7ba21
	expectedAttachType AttachType // since 4.17 5e43f899b03a
}

// bpfProgInfo is filled in by BPF_OBJ_GET_INFO_BY_FD for programs.
type bpfProgInfo struct {
	progType     uint32
	id           uint32
	tag          [unix.BPF_TAG_SIZE]byte
	jitedLen     uint32
	xlatedLen    uint32
	jited        syscallPtr
	xlated       syscallPtr
	loadTime     uint64 // since 4.15 cb4d2b3f03d8
	createdByUID uint32
	nrMapIDs     uint32
	mapIds       syscallPtr
	name         bpfObjName
}

// bpfProgTestRunAttr is used for BPF_PROG_TEST_RUN.
type bpfProgTestRunAttr struct {
	fd          uint32
	retval      uint32
	dataSizeIn  uint32
	dataSizeOut uint32
	dataIn      syscallPtr
	dataOut     syscallPtr
	repeat      uint32
	duration    uint32
}

// bpfProgAlterAttr is used for BPF_PROG_ATTACH and BPF_PROG_DETACH.
type bpfProgAlterAttr struct {
	targetFd    uint32
	attachBpfFd uint32
	attachType  uint32
	attachFlags uint32
}

// bpfObjGetInfoByFDAttr is used for BPF_OBJ_GET_INFO_BY_FD.
type bpfObjGetInfoByFDAttr struct {
	fd      uint32
	infoLen uint32
	info    syscallPtr // May be either bpfMapInfo or bpfProgInfo
}

// bpfGetFDByIDAttr is passed to _MapGetFDByID / _ProgGetFDByID; this
// package only ever sets the id field.
type bpfGetFDByIDAttr struct {
	id   uint32
	next uint32
}

// newPtr wraps ptr for use inside a syscall attribute struct.
func newPtr(ptr unsafe.Pointer) syscallPtr {
	return syscallPtr{ptr: ptr}
}
+
// bpfProgLoad wraps the BPF_PROG_LOAD command, retrying when the
// verifier is interrupted by a signal.
func bpfProgLoad(attr *bpfProgLoadAttr) (*bpfFD, error) {
	for {
		fd, err := bpfCall(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
		// As of ~4.20 the verifier can be interrupted by a signal,
		// and returns EAGAIN in that case.
		if err == unix.EAGAIN {
			continue
		}

		if err != nil {
			return nil, err
		}

		return newBPFFD(uint32(fd)), nil
	}
}

// bpfProgAlter issues an attach/detach-style command for a program.
func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error {
	_, err := bpfCall(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
	return err
}

// bpfMapCreate wraps BPF_MAP_CREATE and returns the new map's fd.
func bpfMapCreate(attr *bpfMapCreateAttr) (*bpfFD, error) {
	fd, err := bpfCall(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
	if err != nil {
		return nil, err
	}

	return newBPFFD(uint32(fd)), nil
}
+
// bpfMapLookupElem copies the value for key into valueOut.
func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error {
	fd, err := m.value()
	if err != nil {
		return err
	}

	attr := bpfMapOpAttr{
		mapFd: fd,
		key:   key,
		value: valueOut,
	}
	_, err = bpfCall(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}

// bpfMapUpdateElem stores the value at valueOut under key, subject to
// the kernel's update flags.
func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error {
	fd, err := m.value()
	if err != nil {
		return err
	}

	attr := bpfMapOpAttr{
		mapFd: fd,
		key:   key,
		value: valueOut,
		flags: flags,
	}
	_, err = bpfCall(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}

// bpfMapDeleteElem removes the entry for key.
func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error {
	fd, err := m.value()
	if err != nil {
		return err
	}

	attr := bpfMapOpAttr{
		mapFd: fd,
		key:   key,
	}
	_, err = bpfCall(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}

// bpfMapGetNextKey writes the key following key into nextKeyOut; the
// result travels through the attr's value field.
func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error {
	fd, err := m.value()
	if err != nil {
		return err
	}

	attr := bpfMapOpAttr{
		mapFd: fd,
		key:   key,
		value: nextKeyOut,
	}
	_, err = bpfCall(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return err
}
+
+const bpfFSType = 0xcafe4a11
+
+func bpfPinObject(fileName string, fd *bpfFD) error {
+       dirName := filepath.Dir(fileName)
+       var statfs unix.Statfs_t
+       if err := unix.Statfs(dirName, &statfs); err != nil {
+               return err
+       }
+       if uint64(statfs.Type) != bpfFSType {
+               return errors.Errorf("%s is not on a bpf filesystem", fileName)
+       }
+
+       value, err := fd.value()
+       if err != nil {
+               return err
+       }
+
+       _, err = bpfCall(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{
+               fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])),
+               fd:       value,
+       }), 16)
+       return errors.Wrapf(err, "pin object %s", fileName)
+}
+
+func bpfGetObject(fileName string) (*bpfFD, error) {
+       ptr, err := bpfCall(_ObjGet, unsafe.Pointer(&bpfPinObjAttr{
+               fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])),
+       }), 16)
+       if err != nil {
+               return nil, errors.Wrapf(err, "get object %s", fileName)
+       }
+       return newBPFFD(uint32(ptr)), nil
+}
+
+func bpfGetObjectInfoByFD(fd *bpfFD, info unsafe.Pointer, size uintptr) error {
+       value, err := fd.value()
+       if err != nil {
+               return err
+       }
+
+       // available from 4.13
+       attr := bpfObjGetInfoByFDAttr{
+               fd:      value,
+               infoLen: uint32(size),
+               info:    newPtr(info),
+       }
+       _, err = bpfCall(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+       return errors.Wrapf(err, "fd %d", value)
+}
+
+func bpfGetProgInfoByFD(fd *bpfFD) (*bpfProgInfo, error) {
+       var info bpfProgInfo
+       err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
+       return &info, errors.Wrap(err, "can't get program info")
+}
+
+func bpfGetMapInfoByFD(fd *bpfFD) (*bpfMapInfo, error) {
+       var info bpfMapInfo
+       err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
+       return &info, errors.Wrap(err, "can't get map info:")
+}
+
// haveObjName probes whether the kernel accepts names on BPF objects
// (mapName/progName) by creating a tiny named array map.
var haveObjName = featureTest{
	Fn: func() bool {
		name, err := newBPFObjName("feature_test")
		if err != nil {
			// This really is a fatal error, but it should be caught
			// by the unit tests not working.
			return false
		}

		attr := bpfMapCreateAttr{
			mapType:    Array,
			keySize:    4,
			valueSize:  4,
			maxEntries: 1,
			mapName:    name,
		}

		fd, err := bpfMapCreate(&attr)
		if err != nil {
			return false
		}

		// The map was only needed for the probe; close it immediately.
		_ = fd.close()
		return true
	},
}
+
// bpfGetMapFDByID opens a new fd for the map with the given kernel id.
func bpfGetMapFDByID(id uint32) (*bpfFD, error) {
	// available from 4.13
	attr := bpfGetFDByIDAttr{
		id: id,
	}
	ptr, err := bpfCall(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err != nil {
		return nil, errors.Wrapf(err, "can't get fd for map id %d", id)
	}
	return newBPFFD(uint32(ptr)), nil
}

// bpfGetProgramFDByID opens a new fd for the program with the given
// kernel id.
func bpfGetProgramFDByID(id uint32) (*bpfFD, error) {
	// available from 4.13
	attr := bpfGetFDByIDAttr{
		id: id,
	}
	ptr, err := bpfCall(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err != nil {
		return nil, errors.Wrapf(err, "can't get fd for program id %d", id)
	}
	return newBPFFD(uint32(ptr)), nil
}
+
// bpfCall performs the bpf(2) syscall with the given command and
// attribute struct, returning the raw syscall result.
func bpfCall(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) {
	r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size)
	// Keep the attribute memory reachable for the duration of the call.
	runtime.KeepAlive(attr)

	var err error
	if errNo != 0 {
		err = errNo
	}

	return r1, err
}
+
// convertCString interprets in as a NUL-terminated C string and returns
// the bytes preceding the first NUL. Input without a terminator is
// treated as invalid and yields "".
func convertCString(in []byte) string {
	for i, c := range in {
		if c == 0 {
			return string(in[:i])
		}
	}
	return ""
}
diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go
new file mode 100644 (file)
index 0000000..0daf9a7
--- /dev/null
@@ -0,0 +1,189 @@
+package ebpf
+
+//go:generate stringer -output types_string.go -type=MapType,ProgramType
+
+// MapType indicates the type map structure
+// that will be initialized in the kernel.
+type MapType uint32
+
+// All the various map types that can be created
+// NOTE(review): the iota values presumably mirror the kernel's
+// enum bpf_map_type numbering — do not reorder or insert entries.
+const (
+       UnspecifiedMap MapType = iota
+       // Hash is a hash map
+       Hash
+       // Array is an array map
+       Array
+       // ProgramArray - A program array map is a special kind of array map whose map
+       // values contain only file descriptors referring to other eBPF
+       // programs.  Thus, both the key_size and value_size must be
+       // exactly four bytes.  This map is used in conjunction with the
+       // TailCall helper.
+       ProgramArray
+       // PerfEventArray - A perf event array is used in conjunction with PerfEventRead
+       // and PerfEventOutput calls, to read the raw bpf_perf_data from the registers.
+       PerfEventArray
+       // PerCPUHash - This data structure is useful for people who have high performance
+       // network needs and can reconcile adds at the end of some cycle, so that
+       // hashes can be lock free without the use of XAdd, which can be costly.
+       PerCPUHash
+       // PerCPUArray - This data structure is useful for people who have high performance
+       // network needs and can reconcile adds at the end of some cycle, so that
+       // hashes can be lock free without the use of XAdd, which can be costly.
+       // Each CPU gets a copy of this hash, the contents of all of which can be reconciled
+       // later.
+       PerCPUArray
+       // StackTrace - This holds whole user and kernel stack traces, it can be retrieved with
+       // GetStackID
+       StackTrace
+       // CGroupArray - This is a very niche structure used to help SKBInCGroup determine
+       // if an skb is from a socket belonging to a specific cgroup
+       CGroupArray
+       // LRUHash - This allows you to create a small hash structure that will purge the
+       // least recently used items rather than throw an error when you run out of memory
+       LRUHash
+       // LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs,
+       // it has more to do with including the CPU id with the LRU calculation so that if a
+       // particular CPU is using a value over-and-over again, then it will be saved, but if
+       // a value is being retrieved a lot but sparsely across CPUs it is not as important, basically
+       // giving weight to CPU locality over overall usage.
+       LRUCPUHash
+       // LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful,
+       // for storing things like IP addresses which can be bit masked allowing for keys of differing
+       // values to refer to the same reference based on their masks. See wikipedia for more details.
+       LPMTrie
+       // ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps
+       // itself.
+       ArrayOfMaps
+       // HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps
+       // itself.
+       HashOfMaps
+)
+
+// hasPerCPUValue returns true if the Map stores a value per CPU.
+func (mt MapType) hasPerCPUValue() bool {
+       // Only the per-CPU hash and array variants keep one value per CPU;
+       // every other map type stores a single shared value per key.
+       if mt == PerCPUHash || mt == PerCPUArray {
+               return true
+       }
+       return false
+}
+
+// Command numbers passed as the first argument to bpfCall (see
+// bpfGetMapFDByID / bpfGetProgramFDByID for usage).
+// NOTE(review): presumably these mirror the kernel's enum bpf_cmd —
+// the iota ordering must not change.
+const (
+       _MapCreate = iota
+       _MapLookupElem
+       _MapUpdateElem
+       _MapDeleteElem
+       _MapGetNextKey
+       _ProgLoad
+       _ObjPin
+       _ObjGet
+       _ProgAttach
+       _ProgDetach
+       _ProgTestRun
+       _ProgGetNextID
+       _MapGetNextID
+       _ProgGetFDByID
+       _MapGetFDByID
+       _ObjGetInfoByFD
+)
+
+// Element-update modes.
+// NOTE(review): presumably the kernel's BPF_ANY / BPF_NOEXIST / BPF_EXIST
+// flags for BPF_MAP_UPDATE_ELEM — confirm against uapi/linux/bpf.h.
+const (
+       _Any = iota
+       _NoExist
+       _Exist
+)
+
+// ProgramType of the eBPF program
+type ProgramType uint32
+
+// eBPF program types
+// NOTE(review): the iota values presumably mirror the kernel's
+// enum bpf_prog_type numbering — do not reorder or insert entries.
+const (
+       // Unrecognized program type
+       UnspecifiedProgram ProgramType = iota
+       // SocketFilter socket or seccomp filter
+       SocketFilter
+       // Kprobe program
+       Kprobe
+       // SchedCLS traffic control shaper
+       SchedCLS
+       // SchedACT routing control shaper
+       SchedACT
+       // TracePoint program
+       TracePoint
+       // XDP program
+       XDP
+       // PerfEvent program
+       PerfEvent
+       // CGroupSKB program
+       CGroupSKB
+       // CGroupSock program
+       CGroupSock
+       // LWTIn program
+       LWTIn
+       // LWTOut program
+       LWTOut
+       // LWTXmit program
+       LWTXmit
+       // SockOps program
+       SockOps
+       // SkSKB program
+       SkSKB
+       // CGroupDevice program
+       CGroupDevice
+       // SkMsg program
+       SkMsg
+       // RawTracepoint program
+       RawTracepoint
+       // CGroupSockAddr program
+       CGroupSockAddr
+       // LWTSeg6Local program
+       LWTSeg6Local
+       // LircMode2 program
+       LircMode2
+       // SkReuseport program
+       SkReuseport
+       // FlowDissector program
+       FlowDissector
+       // CGroupSysctl program
+       CGroupSysctl
+       // RawTracepointWritable program
+       RawTracepointWritable
+       // CGroupSockopt program
+       CGroupSockopt
+)
+
+// AttachType of the eBPF program, needed to differentiate allowed context accesses in
+// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required.
+// Will cause invalid argument (EINVAL) at program load time if set incorrectly.
+type AttachType uint32
+
+// AttachNone is an alias for AttachCGroupInetIngress for readability reasons
+const AttachNone AttachType = 0
+
+// Attach points understood by the kernel. AttachCGroupInetIngress is 0, so it
+// shares its value with AttachNone above.
+// NOTE(review): the iota values presumably mirror the kernel's
+// enum bpf_attach_type numbering — do not reorder or insert entries.
+const (
+       AttachCGroupInetIngress AttachType = iota
+       AttachCGroupInetEgress
+       AttachCGroupInetSockCreate
+       AttachCGroupSockOps
+       AttachSkSKBStreamParser
+       AttachSkSKBStreamVerdict
+       AttachCGroupDevice
+       AttachSkMsgVerdict
+       AttachCGroupInet4Bind
+       AttachCGroupInet6Bind
+       AttachCGroupInet4Connect
+       AttachCGroupInet6Connect
+       AttachCGroupInet4PostBind
+       AttachCGroupInet6PostBind
+       AttachCGroupUDP4Sendmsg
+       AttachCGroupUDP6Sendmsg
+       AttachLircMode2
+       AttachFlowDissector
+       AttachCGroupSysctl
+       AttachCGroupUDP4Recvmsg
+       AttachCGroupUDP6Recvmsg
+       AttachCGroupGetsockopt
+       AttachCGroupSetsockopt
+)
+
+// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
+type AttachFlags uint32
diff --git a/vendor/github.com/cilium/ebpf/types_string.go b/vendor/github.com/cilium/ebpf/types_string.go
new file mode 100644 (file)
index 0000000..4813437
--- /dev/null
@@ -0,0 +1,78 @@
+// Code generated by "stringer -output types_string.go -type=MapType,ProgramType"; DO NOT EDIT.
+
+package ebpf
+
+import "strconv"
+
+// Compile-time guard (stringer-generated, do not edit by hand): indexing a
+// length-1 array with each constant value fails to compile if any MapType
+// constant changes, forcing the string tables below to be regenerated.
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[UnspecifiedMap-0]
+       _ = x[Hash-1]
+       _ = x[Array-2]
+       _ = x[ProgramArray-3]
+       _ = x[PerfEventArray-4]
+       _ = x[PerCPUHash-5]
+       _ = x[PerCPUArray-6]
+       _ = x[StackTrace-7]
+       _ = x[CGroupArray-8]
+       _ = x[LRUHash-9]
+       _ = x[LRUCPUHash-10]
+       _ = x[LPMTrie-11]
+       _ = x[ArrayOfMaps-12]
+       _ = x[HashOfMaps-13]
+}
+
+// _MapType_name concatenates every MapType name; _MapType_index holds each
+// name's start offset within it, plus one trailing end offset.
+const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps"
+
+var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136}
+
+// String returns the map type's name, or "MapType(N)" for values outside the
+// generated table.
+func (i MapType) String() string {
+       if i >= MapType(len(_MapType_index)-1) {
+               return "MapType(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+       // Adjacent index entries delimit this value's name in _MapType_name.
+       return _MapType_name[_MapType_index[i]:_MapType_index[i+1]]
+}
+// Compile-time guard (stringer-generated, do not edit by hand): same trick as
+// above, for the ProgramType constants.
+func _() {
+       // An "invalid array index" compiler error signifies that the constant values have changed.
+       // Re-run the stringer command to generate them again.
+       var x [1]struct{}
+       _ = x[UnspecifiedProgram-0]
+       _ = x[SocketFilter-1]
+       _ = x[Kprobe-2]
+       _ = x[SchedCLS-3]
+       _ = x[SchedACT-4]
+       _ = x[TracePoint-5]
+       _ = x[XDP-6]
+       _ = x[PerfEvent-7]
+       _ = x[CGroupSKB-8]
+       _ = x[CGroupSock-9]
+       _ = x[LWTIn-10]
+       _ = x[LWTOut-11]
+       _ = x[LWTXmit-12]
+       _ = x[SockOps-13]
+       _ = x[SkSKB-14]
+       _ = x[CGroupDevice-15]
+       _ = x[SkMsg-16]
+       _ = x[RawTracepoint-17]
+       _ = x[CGroupSockAddr-18]
+       _ = x[LWTSeg6Local-19]
+       _ = x[LircMode2-20]
+       _ = x[SkReuseport-21]
+       _ = x[FlowDissector-22]
+       _ = x[CGroupSysctl-23]
+       _ = x[RawTracepointWritable-24]
+       _ = x[CGroupSockopt-25]
+}
+
+// _ProgramType_name concatenates every ProgramType name; _ProgramType_index
+// holds each name's start offset (uint16 since offsets exceed 255).
+const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt"
+
+var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258}
+
+// String returns the program type's name, or "ProgramType(N)" for values
+// outside the generated table.
+func (i ProgramType) String() string {
+       if i >= ProgramType(len(_ProgramType_index)-1) {
+               return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")"
+       }
+       // Adjacent index entries delimit this value's name within _ProgramType_name.
+       return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]]
+}