From: Dmitry Smirnov Date: Sun, 26 Jan 2020 09:24:01 +0000 (+0000) Subject: Import runc_1.0.0~rc10+dfsg1.orig.tar.xz X-Git-Tag: archive/raspbian/1.0.0_rc10+dfsg1-1+rpi1^2~5 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=c31a8e4cd7563effe228286ad003510332456cf9;p=runc.git Import runc_1.0.0~rc10+dfsg1.orig.tar.xz [dgit import orig runc_1.0.0~rc10+dfsg1.orig.tar.xz] --- c31a8e4cd7563effe228286ad003510332456cf9 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..282e34e --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +vendor/pkg +/runc +/runc-* +contrib/cmd/recvtty/recvtty +man/man8 +release diff --git a/.pullapprove.yml b/.pullapprove.yml new file mode 100644 index 0000000..fc8c5d3 --- /dev/null +++ b/.pullapprove.yml @@ -0,0 +1,10 @@ +approve_by_comment: true +approve_regex: ^LGTM +reject_regex: ^Rejected +reset_on_push: true +author_approval: ignored +reviewers: + teams: + - runc-maintainers + name: default + required: 2 diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5c2928d --- /dev/null +++ b/.travis.yml @@ -0,0 +1,54 @@ +dist: bionic +language: go +go: + - 1.11.x + - 1.12.x + - tip + +matrix: + include: + - go: 1.12.x + env: + - RUNC_USE_SYSTEMD=1 + script: + - make BUILDTAGS="${BUILDTAGS}" all + - sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1 + - go: 1.12.x + env: + - VIRTUALBOX_VERSION=6.0 + - VAGRANT_VERSION=2.2.6 + - FEDORA_VERSION=31 + before_install: + - cat /proc/cpuinfo + - wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami) + - wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb + - vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config + - ssh default sudo dnf install -y podman + script: + - ssh default sudo podman build -t test /vagrant + - ssh default sudo podman run --privileged --cgroupns=private test make localunittest + allow_failures: + - go: tip + +go_import_path: github.com/opencontainers/runc + +# `make ci` uses Docker. +sudo: required +services: + - docker + +env: + global: + - BUILDTAGS="seccomp apparmor selinux ambient" + +before_install: + - sudo apt-get -qq update + - sudo apt-get install -y libseccomp-dev + - go get -u golang.org/x/lint/golint + - go get -u github.com/vbatts/git-validation + - env | grep TRAVIS_ + +script: + - git-validation -run DCO,short-subject -v + - make BUILDTAGS="${BUILDTAGS}" + - make BUILDTAGS="${BUILDTAGS}" clean ci cross diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..3b674cf --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,124 @@ +## Contribution Guidelines + +### Security issues + +If you are reporting a security issue, do not create an issue or file a pull +request on GitHub. Instead, disclose the issue responsibly by sending an email +to security@opencontainers.org (which is inhabited only by the maintainers of +the various OCI projects). + +### Pull requests are always welcome + +We are always thrilled to receive pull requests, and do our best to +process them as fast as possible. Not sure if that typo is worth a pull +request? Do it! We will appreciate it. + +If your pull request is not accepted on the first try, don't be +discouraged! If there's a problem with the implementation, hopefully you +received feedback on what to improve. + +We're trying very hard to keep runc lean and focused. We don't want it +to do everything for everybody. This means that we might decide against +incorporating a new feature. However, there might be a way to implement +that feature *on top of* runc. + + +### Conventions + +Fork the repo and make changes on your fork in a feature branch: + +- If it's a bugfix branch, name it XXX-something where XXX is the number of the + issue +- If it's a feature branch, create an enhancement issue to announce your + intentions, and name it XXX-something where XXX is the number of the issue. + +Submit unit tests for your changes. Go has a great test framework built in; use +it! Take a look at existing tests for inspiration. Run the full test suite on +your branch before submitting a pull request. + +Update the documentation when creating or modifying features. Test +your documentation changes for clarity, concision, and correctness, as +well as a clean documentation build. See ``docs/README.md`` for more +information on building the docs and how docs get released. + +Write clean code. Universally formatted code promotes ease of writing, reading, +and maintenance. Always run `gofmt -s -w file.go` on each changed file before +committing your changes. Most editors have plugins that do this automatically. + +Pull requests descriptions should be as clear as possible and include a +reference to all the issues that they address. + +Pull requests must not contain commits from other users or branches. + +Commit messages must start with a capitalized and short summary (max. 50 +chars) written in the imperative, followed by an optional, more detailed +explanatory text which is separated from the summary by an empty line. + +Code review comments may be added to your pull request. Discuss, then make the +suggested modifications and push additional commits to your feature branch. Be +sure to post a comment after pushing. The new commits will show up in the pull +request automatically, but the reviewers will not be notified unless you +comment. + +Before the pull request is merged, make sure that you squash your commits into +logical units of work using `git rebase -i` and `git push -f`. After every +commit the test suite should be passing. Include documentation changes in the +same commit so that a revert would remove all traces of the feature or fix. + +Commits that fix or close an issue should include a reference like `Closes #XXX` +or `Fixes #XXX`, which will automatically close the issue when merged. + +### Sign your work + +The sign-off is a simple line at the end of the explanation for the +patch, which certifies that you wrote it or otherwise have the right to +pass it on as an open-source patch. The rules are pretty simple: if you +can certify the below (from +[developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +then you just add a line to every git commit message: + + Signed-off-by: Joe Smith + +using your real name (sorry, no pseudonyms or anonymous contributions.) + +You can add the sign off when creating the git commit via `git commit -s`. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5c65470 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,66 @@ +FROM golang:1.12-stretch + +RUN dpkg --add-architecture armel \ + && dpkg --add-architecture armhf \ + && dpkg --add-architecture arm64 \ + && dpkg --add-architecture ppc64el \ + && apt-get update && apt-get install -y \ + build-essential \ + curl \ + sudo \ + gawk \ + iptables \ + jq \ + pkg-config \ + libaio-dev \ + libcap-dev \ + libprotobuf-dev \ + libprotobuf-c0-dev \ + libnl-3-dev \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + protobuf-c-compiler \ + protobuf-compiler \ + python-minimal \ + uidmap \ + kmod \ + crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \ + libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \ + --no-install-recommends \ + && apt-get clean + +# Add a dummy user for the rootless integration tests. While runC does +# not require an entry in /etc/passwd to operate, one of the tests uses +# `git clone` -- and `git clone` does not allow you to clone a +# repository if the current uid does not have an entry in /etc/passwd. +RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless + +# install bats +RUN cd /tmp \ + && git clone https://github.com/sstephenson/bats.git \ + && cd bats \ + && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \ + && ./install.sh /usr/local \ + && rm -rf /tmp/bats + +# install criu +ENV CRIU_VERSION v3.12 +RUN mkdir -p /usr/src/criu \ + && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \ + && cd /usr/src/criu \ + && make install-criu \ + && rm -rf /usr/src/criu + +# setup a playground for us to spawn containers in +ENV ROOTFS /busybox +RUN mkdir -p ${ROOTFS} + +COPY script/tmpmount / +WORKDIR /go/src/github.com/opencontainers/runc +ENTRYPOINT ["/tmpmount"] + +ADD . /go/src/github.com/opencontainers/runc + +RUN . tests/integration/multi-arch.bash \ + && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2744858 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MAINTAINERS b/MAINTAINERS new file mode 100644 index 0000000..9fe08d3 --- /dev/null +++ b/MAINTAINERS @@ -0,0 +1,5 @@ +Michael Crosby (@crosbymichael) +Mrunal Patel (@mrunalp) +Daniel, Dao Quang Minh (@dqminh) +Qiang Huang (@hqhq) +Aleksa Sarai (@cyphar) diff --git a/MAINTAINERS_GUIDE.md b/MAINTAINERS_GUIDE.md new file mode 100644 index 0000000..7442103 --- /dev/null +++ b/MAINTAINERS_GUIDE.md @@ -0,0 +1,120 @@ +## Introduction + +Dear maintainer. Thank you for investing the time and energy to help +make runc as useful as possible. Maintaining a project is difficult, +sometimes unrewarding work. Sure, you will get to contribute cool +features to the project. But most of your time will be spent reviewing, +cleaning up, documenting, answering questions, justifying design +decisions - while everyone has all the fun! But remember - the quality +of the maintainers work is what distinguishes the good projects from the +great. So please be proud of your work, even the unglamorous parts, +and encourage a culture of appreciation and respect for *every* aspect +of improving the project - not just the hot new features. + +This document is a manual for maintainers old and new. It explains what +is expected of maintainers, how they should work, and what tools are +available to them. + +This is a living document - if you see something out of date or missing, +speak up! + +## What are a maintainer's responsibility? + +It is every maintainer's responsibility to: + +* 1) Expose a clear roadmap for improving their component. +* 2) Deliver prompt feedback and decisions on pull requests. +* 3) Be available to anyone with questions, bug reports, criticism etc. + on their component. This includes IRC and GitHub issues and pull requests. +* 4) Make sure their component respects the philosophy, design and + roadmap of the project. + +## How are decisions made? + +Short answer: with pull requests to the runc repository. + +runc is an open-source project with an open design philosophy. This +means that the repository is the source of truth for EVERY aspect of the +project, including its philosophy, design, roadmap and APIs. *If it's +part of the project, it's in the repo. It's in the repo, it's part of +the project.* + +As a result, all decisions can be expressed as changes to the +repository. An implementation change is a change to the source code. An +API change is a change to the API specification. A philosophy change is +a change to the philosophy manifesto. And so on. + +All decisions affecting runc, big and small, follow the same 3 steps: + +* Step 1: Open a pull request. Anyone can do this. + +* Step 2: Discuss the pull request. Anyone can do this. + +* Step 3: Accept (`LGTM`) or refuse a pull request. The relevant maintainers do +this (see below "Who decides what?") + +*I'm a maintainer, should I make pull requests too?* + +Yes. Nobody should ever push to master directly. All changes should be +made through a pull request. + +## Who decides what? + +All decisions are pull requests, and the relevant maintainers make +decisions by accepting or refusing the pull request. Review and acceptance +by anyone is denoted by adding a comment in the pull request: `LGTM`. +However, only currently listed `MAINTAINERS` are counted towards the required +two LGTMs. + +Overall the maintainer system works because of mutual respect across the +maintainers of the project. The maintainers trust one another to make decisions +in the best interests of the project. Sometimes maintainers can disagree and +this is part of a healthy project to represent the point of views of various people. +In the case where maintainers cannot find agreement on a specific change the +role of a Chief Maintainer comes into play. + +The Chief Maintainer for the project is responsible for overall architecture +of the project to maintain conceptual integrity. Large decisions and +architecture changes should be reviewed by the chief maintainer. +The current chief maintainer for the project is Michael Crosby (@crosbymichael). + +Even though the maintainer system is built on trust, if there is a conflict +with the chief maintainer on a decision, their decision can be challenged +and brought to the technical oversight board if two-thirds of the +maintainers vote for an appeal. It is expected that this would be a +very exceptional event. + + +### How are maintainers added? + +The best maintainers have a vested interest in the project. Maintainers +are first and foremost contributors that have shown they are committed to +the long term success of the project. Contributors wanting to become +maintainers are expected to be deeply involved in contributing code, +pull request review, and triage of issues in the project for more than two months. + +Just contributing does not make you a maintainer, it is about building trust +with the current maintainers of the project and being a person that they can +depend on and trust to make decisions in the best interest of the project. The +final vote to add a new maintainer should be approved by over 66% of the current +maintainers with the chief maintainer having veto power. In case of a veto, +conflict resolution rules expressed above apply. The voting period is +five business days on the Pull Request to add the new maintainer. + + +### What is expected of maintainers? + +Part of a healthy project is to have active maintainers to support the community +in contributions and perform tasks to keep the project running. Maintainers are +expected to be able to respond in a timely manner if their help is required on specific +issues where they are pinged. Being a maintainer is a time consuming commitment and should +not be taken lightly. + +When a maintainer is unable to perform the required duties they can be removed with +a vote by 66% of the current maintainers with the chief maintainer having veto power. +The voting period is ten business days. Issues related to a maintainer's performance should +be discussed with them among the other maintainers so that they are not surprised by +a pull request removing them. + + + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..81db9d9 --- /dev/null +++ b/Makefile @@ -0,0 +1,133 @@ +.PHONY: all shell dbuild man release \ + localtest localunittest localintegration \ + test unittest integration \ + cross localcross + +CONTAINER_ENGINE := docker +GO := go + +SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$') +PREFIX := $(DESTDIR)/usr/local +BINDIR := $(PREFIX)/sbin +GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") +RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) +PROJECT := github.com/opencontainers/runc +BUILDTAGS ?= seccomp +COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) +COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}") + +MAN_DIR := $(CURDIR)/man/man8 +MAN_PAGES = $(shell ls $(MAN_DIR)/*.8) +MAN_PAGES_BASE = $(notdir $(MAN_PAGES)) +MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/ + +RELEASE_DIR := $(CURDIR)/release + +VERSION := ${shell cat ./VERSION} + +SHELL := $(shell command -v bash 2>/dev/null) + +.DEFAULT: runc + +runc: $(SOURCES) + $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc . + +all: runc recvtty + +recvtty: contrib/cmd/recvtty/recvtty + +contrib/cmd/recvtty/recvtty: $(SOURCES) + $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty + +static: $(SOURCES) + CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc . + CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty + +release: + script/release.sh -r release/$(VERSION) -v $(VERSION) + +dbuild: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all + +lint: + $(GO) vet $(allpackages) + $(GO) fmt $(allpackages) + +man: + man/md2man-all.sh + +runcimage: + $(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) . + +test: + make unittest integration rootlessintegration + +localtest: + make localunittest localintegration localrootlessintegration + +unittest: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS} + +localunittest: all + $(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages) + +integration: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH} + +localintegration: all + bats -t tests/integration${TESTPATH} + +rootlessintegration: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration + +localrootlessintegration: all + tests/rootless.sh + +shell: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash + +install: + install -D -m0755 runc $(BINDIR)/runc + +install-bash: + install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc + +install-man: + install -d -m 755 $(MAN_INSTALL_PATH) + install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH) + +uninstall: + rm -f $(BINDIR)/runc + +uninstall-bash: + rm -f $(PREFIX)/share/bash-completion/completions/runc + +uninstall-man: + rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE)) + +clean: + rm -f runc runc-* + rm -f contrib/cmd/recvtty/recvtty + rm -rf $(RELEASE_DIR) + rm -rf $(MAN_DIR) + +validate: + script/validate-gofmt + script/validate-c + $(GO) vet $(allpackages) + +ci: validate test release + +cross: runcimage + $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross + +localcross: + CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel . + CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf . + CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 . + CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le . + +# memoize allpackages, so that it's executed only once and only if used +_allpackages = $(shell $(GO) list ./... | grep -v vendor) +allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..5c97abc --- /dev/null +++ b/NOTICE @@ -0,0 +1,17 @@ +runc + +Copyright 2012-2015 Docker, Inc. + +This product includes software developed at Docker, Inc. (http://www.docker.com). + +The following is courtesy of our legal counsel: + + +Use and transfer of Docker may be subject to certain restrictions by the +United States and other governments. +It is your responsibility to ensure that your use and/or transfer does not +violate applicable laws. + +For more information, please see http://www.bis.doc.gov + +See also http://www.apache.org/dev/crypto.html and/or seek legal counsel. diff --git a/PRINCIPLES.md b/PRINCIPLES.md new file mode 100644 index 0000000..fdcc373 --- /dev/null +++ b/PRINCIPLES.md @@ -0,0 +1,19 @@ +# runc principles + +In the design and development of runc and libcontainer we try to follow these principles: + +(Work in progress) + +* Don't try to replace every tool. Instead, be an ingredient to improve them. +* Less code is better. +* Fewer components are better. Do you really need to add one more class? +* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand. +* Don't do later what you can do now. "//TODO: refactor" is not acceptable in new code. +* When hesitating between two options, choose the one that is easier to reverse. +* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later. +* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable. +* The fewer moving parts in a container, the better. +* Don't merge it unless you document it. +* Don't document it unless you can keep it up-to-date. +* Don't merge it unless you test it! +* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a806f27 --- /dev/null +++ b/README.md @@ -0,0 +1,280 @@ +# runc + +[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc) +[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc) +[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc) + +## Introduction + +`runc` is a CLI tool for spawning and running containers according to the OCI specification. + +## Releases + +`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository. +We will try to make sure that `runc` and the OCI specification major versions stay in lockstep. +This means that `runc` 1.0.0 should implement the 1.0 version of the specification. + +You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page. + +Currently, the following features are not considered to be production-ready: + +* Support for cgroup v2 + +## Security + +The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/). + +## Building + +`runc` currently supports the Linux platform with various architecture support. +It must be built with Go version 1.6 or higher in order for some features to function properly. + +In order to enable seccomp support you will need to install `libseccomp` on your platform. +> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu + +Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make. + +```bash +# create a 'github.com/opencontainers' in your GOPATH/src +cd github.com/opencontainers +git clone https://github.com/opencontainers/runc +cd runc + +make +sudo make install +``` + +You can also use `go get` to install to your `GOPATH`, assuming that you have a `github.com` parent folder already created under `src`: + +```bash +go get github.com/opencontainers/runc +cd $GOPATH/src/github.com/opencontainers/runc +make +sudo make install +``` + +`runc` will be installed to `/usr/local/sbin/runc` on your system. + + +#### Build Tags + +`runc` supports optional build tags for compiling support of various features. +To add build tags to the make option the `BUILDTAGS` variable must be set. + +```bash +make BUILDTAGS='seccomp apparmor' +``` + +| Build Tag | Feature | Dependency | +|-----------|------------------------------------|-------------| +| seccomp | Syscall filtering | libseccomp | +| selinux | selinux process and mount labeling | | +| apparmor | apparmor profile support | | +| ambient | ambient capability support | kernel 4.3 | +| nokmem | disable kernel memory account | | + + +### Running the test suite + +`runc` currently supports running its test suite via Docker. +To run the suite just type `make test`. + +```bash +make test +``` + +There are additional make targets for running the tests outside of a container but this is not recommended as the tests are written with the expectation that they can write and remove anywhere. + +You can run a specific test case by setting the `TESTFLAGS` variable. + +```bash +# make test TESTFLAGS="-run=SomeTestFunction" +``` + +You can run a specific integration test by setting the `TESTPATH` variable. + +```bash +# make test TESTPATH="/checkpoint.bats" +``` + +You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables. + +```bash +# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/" +``` + +### Dependencies Management + +`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management. +Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or update +new dependencies. + +## Using runc + +### Creating an OCI Bundle + +In order to use runc you must have your container in the format of an OCI bundle. +If you have Docker installed you can use its `export` method to acquire a root filesystem from an existing Docker container. + +```bash +# create the top most bundle directory +mkdir /mycontainer +cd /mycontainer + +# create the rootfs directory +mkdir rootfs + +# export busybox via Docker into the rootfs directory +docker export $(docker create busybox) | tar -C rootfs -xvf - +``` + +After a root filesystem is populated you just generate a spec in the format of a `config.json` file inside your bundle. +`runc` provides a `spec` command to generate a base template spec that you are then able to edit. +To find features and documentation for fields in the spec please refer to the [specs](https://github.com/opencontainers/runtime-spec) repository. + +```bash +runc spec +``` + +### Running Containers + +Assuming you have an OCI bundle from the previous step you can execute the container in two different ways. + +The first way is to use the convenience command `run` that will handle creating, starting, and deleting the container after it exits. + +```bash +# run as root +cd /mycontainer +runc run mycontainerid +``` + +If you used the unmodified `runc spec` template this should give you a `sh` session inside the container. + +The second way to start a container is using the specs lifecycle operations. +This gives you more power over how the container is created and managed while it is running. +This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here. +Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`. + + +```json + "process": { + "terminal": false, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "sleep", "5" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, +``` + +Now we can go through the lifecycle operations in your shell. + + +```bash +# run as root +cd /mycontainer +runc create mycontainerid + +# view the container is created and in the "created" state +runc list + +# start the process inside the container +runc start mycontainerid + +# after 5 seconds view that the container has exited and is now in the stopped state +runc list + +# now delete the container +runc delete mycontainerid +``` + +This allows higher level systems to augment the containers creation logic with setup of various settings after the container is created and/or before it is deleted. For example, the container's network stack is commonly set up after `create` but before `start`. + +#### Rootless containers +`runc` has the ability to run containers without root privileges. This is called `rootless`. You need to pass some parameters to `runc` in order to run rootless containers. See below and compare with the previous version. + +**Note:** In order to use this feature, "User Namespaces" must be compiled and enabled in your kernel. There are various ways to do this depending on your distribution: +- Confirm `CONFIG_USER_NS=y` is set in your kernel configuration (normally found in `/proc/config.gz`) +- Arch/Debian: `echo 1 > /proc/sys/kernel/unprivileged_userns_clone` +- RHEL/CentOS 7: `echo 28633 > /proc/sys/user/max_user_namespaces` + +Run the following commands as an ordinary user: +```bash +# Same as the first example +mkdir ~/mycontainer +cd ~/mycontainer +mkdir rootfs +docker export $(docker create busybox) | tar -C rootfs -xvf - + +# The --rootless parameter instructs runc spec to generate a configuration for a rootless container, which will allow you to run the container as a non-root user. +runc spec --rootless + +# The --root parameter tells runc where to store the container state. It must be writable by the user. +runc --root /tmp/runc run mycontainerid +``` + +#### Supervisors + +`runc` can be used with process supervisors and init systems to ensure that containers are restarted when they exit. +An example systemd unit file looks something like this. + +```systemd +[Unit] +Description=Start My Container + +[Service] +Type=forking +ExecStart=/usr/local/sbin/runc run -d --pid-file /run/mycontainerid.pid mycontainerid +ExecStopPost=/usr/local/sbin/runc delete mycontainerid +WorkingDirectory=/mycontainer +PIDFile=/run/mycontainerid.pid + +[Install] +WantedBy=multi-user.target +``` + +## License + +The code and docs are released under the [Apache 2.0 license](LICENSE). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..63a7438 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,3 @@ +# Security + +The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/). diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..950f8ca --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0.0-rc10 diff --git a/checkpoint.go b/checkpoint.go new file mode 100644 index 0000000..ae01ea3 --- /dev/null +++ b/checkpoint.go @@ -0,0 +1,137 @@ +// +build linux + +package main + +import ( + "fmt" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + + "golang.org/x/sys/unix" +) + +var checkpointCommand = cli.Command{ + Name: "checkpoint", + Usage: "checkpoint a running container", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +checkpointed.`, + Description: `The checkpoint command saves the state of the container instance.`, + Flags: []cli.Flag{ + cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"}, + cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"}, + cli.StringFlag{Name: "parent-path", Value: "", Usage: "path for previous criu image files in pre-dump"}, + cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"}, + cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, + cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, + cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"}, + cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, + cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, + cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, + cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, + cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"}, + cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"}, + cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"}, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + // XXX: Currently this is untested with rootless containers. + if os.Geteuid() != 0 || system.RunningInUserNS() { + logrus.Warn("runc checkpoint is untested with rootless containers") + } + + container, err := getContainer(context) + if err != nil { + return err + } + status, err := container.Status() + if err != nil { + return err + } + if status == libcontainer.Created || status == libcontainer.Stopped { + fatalf("Container cannot be checkpointed in %s state", status.String()) + } + defer destroy(container) + options := criuOptions(context) + // these are the mandatory criu options for a container + setPageServer(context, options) + setManageCgroupsMode(context, options) + if err := setEmptyNsMask(context, options); err != nil { + return err + } + return container.Checkpoint(options) + }, +} + +func getCheckpointImagePath(context *cli.Context) string { + imagePath := context.String("image-path") + if imagePath == "" { + imagePath = getDefaultImagePath(context) + } + return imagePath +} + +func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) { + // xxx following criu opts are optional + // The dump image can be sent to a criu page server + if psOpt := context.String("page-server"); psOpt != "" { + addressPort := strings.Split(psOpt, ":") + if len(addressPort) != 2 { + fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server")) + } + portInt, err := strconv.Atoi(addressPort[1]) + if err != nil { + fatal(fmt.Errorf("Invalid port number")) + } + options.PageServer = libcontainer.CriuPageServerInfo{ + Address: addressPort[0], + Port: int32(portInt), + } + } +} + +func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) { + if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" { + switch cgOpt { + case "soft": + options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT + case "full": + options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL + case "strict": + options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT + default: + fatal(fmt.Errorf("Invalid manage cgroups mode")) + } + } +} + +var namespaceMapping = map[specs.LinuxNamespaceType]int{ + specs.NetworkNamespace: unix.CLONE_NEWNET, +} + +func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error { + /* Runc doesn't manage network devices and their configuration */ + nsmask := unix.CLONE_NEWNET + + for _, ns := range context.StringSlice("empty-ns") { + f, exists := namespaceMapping[specs.LinuxNamespaceType(ns)] + if !exists { + return fmt.Errorf("namespace %q is not supported", ns) + } + nsmask |= f + } + + options.EmptyNs = uint32(nsmask) + return nil +} diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go new file mode 100644 index 0000000..a658b8d --- /dev/null +++ b/contrib/cmd/recvtty/recvtty.go @@ -0,0 +1,238 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "io" + "io/ioutil" + "net" + "os" + "strings" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +// version will be populated by the Makefile, read from +// VERSION file of the source code. +var version = "" + +// gitCommit will be the hash that the binary was built from +// and will be populated by the Makefile +var gitCommit = "" + +const ( + usage = `Open Container Initiative contrib/cmd/recvtty + +recvtty is a reference implementation of a consumer of runC's --console-socket +API. It has two main modes of operation: + + * single: Only permit one terminal to be sent to the socket, which is + then hooked up to the stdio of the recvtty process. This is useful + for rudimentary shell management of a container. + + * null: Permit as many terminals to be sent to the socket, but they + are read to /dev/null. This is used for testing, and imitates the + old runC API's --console=/dev/pts/ptmx hack which would allow for a + similar trick. This is probably not what you want to use, unless + you're doing something like our bats integration tests. + +To use recvtty, just specify a socket path at which you want to receive +terminals: + + $ recvtty [--mode ] socket.sock +` +) + +func bail(err error) { + fmt.Fprintf(os.Stderr, "[recvtty] fatal error: %v\n", err) + os.Exit(1) +} + +func handleSingle(path string) error { + // Open a socket. + ln, err := net.Listen("unix", path) + if err != nil { + return err + } + defer ln.Close() + + // We only accept a single connection, since we can only really have + // one reader for os.Stdin. Plus this is all a PoC. + conn, err := ln.Accept() + if err != nil { + return err + } + defer conn.Close() + + // Close ln, to allow for other instances to take over. + ln.Close() + + // Get the fd of the connection. + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return fmt.Errorf("failed to cast to unixconn") + } + + socket, err := unixconn.File() + if err != nil { + return err + } + defer socket.Close() + + // Get the master file descriptor from runC. + master, err := utils.RecvFd(socket) + if err != nil { + return err + } + c, err := console.ConsoleFromFile(master) + if err != nil { + return err + } + console.ClearONLCR(c.Fd()) + + // Copy from our stdio to the master fd. + quitChan := make(chan struct{}) + go func() { + io.Copy(os.Stdout, c) + quitChan <- struct{}{} + }() + go func() { + io.Copy(c, os.Stdin) + quitChan <- struct{}{} + }() + + // Only close the master fd once we've stopped copying. + <-quitChan + c.Close() + return nil +} + +func handleNull(path string) error { + // Open a socket. + ln, err := net.Listen("unix", path) + if err != nil { + return err + } + defer ln.Close() + + // As opposed to handleSingle we accept as many connections as we get, but + // we don't interact with Stdin at all (and we copy stdout to /dev/null). + for { + conn, err := ln.Accept() + if err != nil { + return err + } + go func(conn net.Conn) { + // Don't leave references lying around. + defer conn.Close() + + // Get the fd of the connection. + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return + } + + socket, err := unixconn.File() + if err != nil { + return + } + defer socket.Close() + + // Get the master file descriptor from runC. + master, err := utils.RecvFd(socket) + if err != nil { + return + } + + // Just do a dumb copy to /dev/null. + devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0) + if err != nil { + // TODO: Handle this nicely. + return + } + + io.Copy(devnull, master) + devnull.Close() + }(conn) + } +} + +func main() { + app := cli.NewApp() + app.Name = "recvtty" + app.Usage = usage + + // Set version to be the same as runC. + var v []string + if version != "" { + v = append(v, version) + } + if gitCommit != "" { + v = append(v, fmt.Sprintf("commit: %s", gitCommit)) + } + app.Version = strings.Join(v, "\n") + + // Set the flags. + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "mode, m", + Value: "single", + Usage: "Mode of operation (single or null)", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "Path to write daemon process ID to", + }, + } + + app.Action = func(ctx *cli.Context) error { + args := ctx.Args() + if len(args) != 1 { + return fmt.Errorf("need to specify a single socket path") + } + path := ctx.Args()[0] + + pidPath := ctx.String("pid-file") + if pidPath != "" { + pid := fmt.Sprintf("%d\n", os.Getpid()) + if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil { + return err + } + } + + switch ctx.String("mode") { + case "single": + if err := handleSingle(path); err != nil { + return err + } + case "null": + if err := handleNull(path); err != nil { + return err + } + default: + return fmt.Errorf("need to select a valid mode: %s", ctx.String("mode")) + } + return nil + } + if err := app.Run(os.Args); err != nil { + bail(err) + } +} diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc new file mode 100644 index 0000000..9517a5b --- /dev/null +++ b/contrib/completions/bash/runc @@ -0,0 +1,826 @@ +#!/bin/bash +# +# bash completion file for runc command +# +# This script provides completion of: +# - commands and their options +# - filepaths +# +# To enable the completions either: +# - place this file in /usr/share/bash-completion/completions +# or +# - copy this file to e.g. ~/.runc-completion.sh and add the line +# below to your .bashrc after bash completion features are loaded +# . ~/.runc-completion.sh +# +# Configuration: +# + +# Note for developers: +# Please arrange options sorted alphabetically by long name with the short +# options immediately following their corresponding long form. +# This order should be applied to lists, alternatives and code blocks. + +__runc_previous_extglob_setting=$(shopt -p extglob) +shopt -s extglob + +__runc_list_all() { + COMPREPLY=($(compgen -W "$(runc list -q)" -- $cur)) +} + +__runc_pos_first_nonflag() { + local argument_flags=$1 + + local counter=$((${subcommand_pos:-${command_pos}} + 1)) + while [ $counter -le $cword ]; do + if [ -n "$argument_flags" ] && eval "case '${words[$counter]}' in $argument_flags) true ;; *) false ;; esac"; then + ((counter++)) + else + case "${words[$counter]}" in + -*) ;; + *) + break + ;; + esac + fi + ((counter++)) + done + + echo $counter +} + +# Transforms a multiline list of strings into a single line string +# with the words separated by "|". +# This is used to prepare arguments to __runc_pos_first_nonflag(). +__runc_to_alternatives() { + local parts=($1) + local IFS='|' + echo "${parts[*]}" +} + +# Transforms a multiline list of options into an extglob pattern +# suitable for use in case statements. +__runc_to_extglob() { + local extglob=$(__runc_to_alternatives "$1") + echo "@($extglob)" +} + +# Subcommand processing. +# Locates the first occurrence of any of the subcommands contained in the +# first argument. In case of a match, calls the corresponding completion +# function and returns 0. +# If no match is found, 1 is returned. The calling function can then +# continue processing its completion. +# +# TODO if the preceding command has options that accept arguments and an +# argument is equal to one of the subcommands, this is falsely detected as +# a match. +__runc_subcommands() { + local subcommands="$1" + + local counter=$(($command_pos + 1)) + while [ $counter -lt $cword ]; do + case "${words[$counter]}" in + $(__runc_to_extglob "$subcommands")) + subcommand_pos=$counter + local subcommand=${words[$counter]} + local completions_func=_runc_${command}_${subcommand} + declare -F $completions_func >/dev/null && $completions_func + return 0 + ;; + esac + ((counter++)) + done + return 1 +} + +# List all Signals +__runc_list_signals() { + COMPREPLY=($(compgen -W "$(for i in $(kill -l | xargs); do echo $i; done | grep SIG)")) +} + +# suppress trailing whitespace +__runc_nospace() { + # compopt is not available in ancient bash versions + type compopt &>/dev/null && compopt -o nospace +} + +# The list of capabilities is defined in types.go, ALL was added manually. +__runc_complete_capabilities() { + COMPREPLY=($(compgen -W " + ALL + AUDIT_CONTROL + AUDIT_WRITE + AUDIT_READ + BLOCK_SUSPEND + CHOWN + DAC_OVERRIDE + DAC_READ_SEARCH + FOWNER + FSETID + IPC_LOCK + IPC_OWNER + KILL + LEASE + LINUX_IMMUTABLE + MAC_ADMIN + MAC_OVERRIDE + MKNOD + NET_ADMIN + NET_BIND_SERVICE + NET_BROADCAST + NET_RAW + SETFCAP + SETGID + SETPCAP + SETUID + SYS_ADMIN + SYS_BOOT + SYS_CHROOT + SYSLOG + SYS_MODULE + SYS_NICE + SYS_PACCT + SYS_PTRACE + SYS_RAWIO + SYS_RESOURCE + SYS_TIME + SYS_TTY_CONFIG + WAKE_ALARM + " -- "$cur")) +} + +_runc_exec() { + local boolean_options=" + --help + --no-new-privs + --tty, -t + --detach, -d + " + + local options_with_args=" + --console-socket + --cwd + --env, -e + --user, -u + --additional-gids, -g + --process, -p + --pid-file + --process-label + --apparmor + --cap, -c + --preserve-fds + " + + local all_options="$options_with_args $boolean_options" + + case "$prev" in + --cap | -c) + __runc_complete_capabilities + return + ;; + + --console-socket | --cwd | --process | --apparmor) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + --env | -e) + COMPREPLY=($(compgen -e -- "$cur")) + __runc_nospace + return + ;; + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$all_options" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +# global options that may appear after the runc command +_runc_runc() { + local boolean_options=" + $global_boolean_options + --help + --version -v + --debug + " + local options_with_args=" + --log + --log-format + --root + --criu + --rootless + " + + case "$prev" in + --log | --root | --criu) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + *) + _filedir + __runc_nospace + ;; + esac + return + ;; + + --log-format) + COMPREPLY=($(compgen -W 'text json' -- "$cur")) + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + if [ $cword -eq $counter ]; then + COMPREPLY=($(compgen -W "${commands[*]} help" -- "$cur")) + fi + ;; + esac +} + +_runc_pause() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_ps() { + local boolean_options=" + --help + -h + " + local options_with_args=" + --format, -f + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_delete() { + local boolean_options=" + --help + -h + --format, -f + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_kill() { + local boolean_options=" + --help + -h + --all + -a + " + + case "$prev" in + "kill") + __runc_list_all + return + ;; + *) + __runc_list_signals + return + ;; + esac + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_events() { + local boolean_options=" + --help + --stats + " + + local options_with_args=" + --interval + " + + case "$prev" in + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_list() { + local boolean_options=" + --help + --quiet + -q + " + + local options_with_args=" + --format + -f + " + + case "$prev" in + --format | -f) + COMPREPLY=($(compgen -W 'text json' -- "$cur")) + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + ;; + esac +} + +_runc_spec() { + local boolean_options=" + --help + --rootless + " + + local options_with_args=" + --bundle + -b + " + + case "$prev" in + --bundle | -b) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + ;; + esac +} + +_runc_run() { + local boolean_options=" + --help + --detatch + -d + --no-subreaper + --no-pivot + --no-new-keyring + " + + local options_with_args=" + --bundle + -b + --console-socket + --pid-file + --preserve-fds + " + + case "$prev" in + --bundle | -b | --console-socket | --pid-file) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_checkpoint() { + local boolean_options=" + --help + -h + --leave-running + --tcp-established + --ext-unix-sk + --shell-job + --lazy-pages + --file-locks + --pre-dump + --auto-dedup + " + + local options_with_args=" + --image-path + --work-path + --parent-path + --status-fd + --page-server + --manage-cgroups-mode + --empty-ns + " + + case "$prev" in + --page-server) ;; + + --manage-cgroups-mode) + COMPREPLY=($(compgen -W "soft full strict" -- "$cur")) + return + ;; + + --image-path | --work-path | --parent-path) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + *) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_create() { + local boolean_options=" + --help + --no-pivot + --no-new-keyring + " + + local options_with_args=" + --bundle + -b + --console-socket + --pid-file + --preserve-fds + " + case "$prev" in + --bundle | -b | --console-socket | --pid-file) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac + +} + +_runc_help() { + local counter=$(__runc_pos_first_nonflag) + if [ $cword -eq $counter ]; then + COMPREPLY=($(compgen -W "${commands[*]}" -- "$cur")) + fi +} + +_runc_restore() { + local boolean_options=" + --help + --tcp-established + --ext-unix-sk + --shell-job + --file-locks + --detach + -d + --no-subreaper + --no-pivot + --auto-dedup + --lazy-pages + " + + local options_with_args=" + -b + --bundle + --image-path + --work-path + --manage-cgroups-mode + --pid-file + --empty-ns + " + + local all_options="$options_with_args $boolean_options" + + case "$prev" in + --manage-cgroups-mode) + COMPREPLY=($(compgen -W "soft full strict" -- "$cur")) + return + ;; + + --pid-file | --image-path | --work-path | --bundle | -b) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$all_options" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_resume() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_state() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_start() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_update() { + local boolean_options=" + --help + " + + local options_with_args=" + --blkio-weight + --cpu-period + --cpu-quota + --cpu-rt-period + --cpu-rt-runtime + --cpu-share + --cpuset-cpus + --cpuset-mems + --kernel-memory + --kernel-memory-tcp + --memory + --memory-reservation + --memory-swap + --pids-limit + --l3-cache-schema + --mem-bw-schema + " + + case "$prev" in + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc() { + local previous_extglob_setting=$(shopt -p extglob) + shopt -s extglob + + local commands=( + checkpoint + create + delete + events + exec + init + kill + list + pause + ps + restore + resume + run + spec + start + state + update + help + h + ) + + # These options are valid as global options for all client commands + # and valid as command options for `runc daemon` + local global_boolean_options=" + --help -h + --version -v + " + + COMPREPLY=() + local cur prev words cword + _get_comp_words_by_ref -n : cur prev words cword + + local command='runc' command_pos=0 subcommand_pos + local counter=1 + while [ $counter -lt $cword ]; do + case "${words[$counter]}" in + -*) ;; + =) + ((counter++)) + ;; + *) + command="${words[$counter]}" + command_pos=$counter + break + ;; + esac + ((counter++)) + done + + local completions_func=_runc_${command} + declare -F $completions_func >/dev/null && $completions_func + + eval "$previous_extglob_setting" + return 0 +} + +eval "$__runc_previous_extglob_setting" +unset __runc_previous_extglob_setting + +complete -F _runc runc diff --git a/create.go b/create.go new file mode 100644 index 0000000..5f3ac60 --- /dev/null +++ b/create.go @@ -0,0 +1,74 @@ +package main + +import ( + "os" + + "github.com/urfave/cli" +) + +var createCommand = cli.Command{ + Name: "create", + Usage: "create a container", + ArgsUsage: ` + +Where "" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The create command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "` + specConfig + `" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: `path to the root of the bundle directory, defaults to the current directory`, + }, + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.BoolFlag{ + Name: "no-new-keyring", + Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key", + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + if err := revisePidFile(context); err != nil { + return err + } + spec, err := setupSpec(context) + if err != nil { + return err + } + status, err := startContainer(context, spec, CT_ACT_CREATE, nil) + if err != nil { + return err + } + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. + os.Exit(status) + return nil + }, +} diff --git a/delete.go b/delete.go new file mode 100644 index 0000000..fb6f38e --- /dev/null +++ b/delete.go @@ -0,0 +1,89 @@ +// +build !solaris + +package main + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/urfave/cli" + + "golang.org/x/sys/unix" +) + +func killContainer(container libcontainer.Container) error { + _ = container.Signal(unix.SIGKILL, false) + for i := 0; i < 100; i++ { + time.Sleep(100 * time.Millisecond) + if err := container.Signal(syscall.Signal(0), false); err != nil { + destroy(container) + return nil + } + } + return fmt.Errorf("container init still running") +} + +var deleteCommand = cli.Command{ + Name: "delete", + Usage: "delete any resources held by the container often used with detached container", + ArgsUsage: ` + +Where "" is the name for the instance of the container. + +EXAMPLE: +For example, if the container id is "ubuntu01" and runc list currently shows the +status of "ubuntu01" as "stopped" the following will delete resources held for +"ubuntu01" removing "ubuntu01" from the runc list of containers: + + # runc delete ubuntu01`, + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "force, f", + Usage: "Forcibly deletes the container if it is still running (uses SIGKILL)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + + id := context.Args().First() + force := context.Bool("force") + container, err := getContainer(context) + if err != nil { + if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists { + // if there was an aborted start or something of the sort then the container's directory could exist but + // libcontainer does not see it because the state.json file inside that directory was never created. + path := filepath.Join(context.GlobalString("root"), id) + if e := os.RemoveAll(path); e != nil { + fmt.Fprintf(os.Stderr, "remove %s: %v\n", path, e) + } + if force { + return nil + } + } + return err + } + s, err := container.Status() + if err != nil { + return err + } + switch s { + case libcontainer.Stopped: + destroy(container) + case libcontainer.Created: + return killContainer(container) + default: + if force { + return killContainer(container) + } + return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s) + } + + return nil + }, +} diff --git a/docs/checkpoint-restore.md b/docs/checkpoint-restore.md new file mode 100644 index 0000000..80ec46b --- /dev/null +++ b/docs/checkpoint-restore.md @@ -0,0 +1,50 @@ +# Checkpoint and Restore # + +For a basic description about checkpointing and restoring containers with +`runc` please see [runc-checkpoint(8)](../man/runc-checkpoint.8.md) and +[runc-restore(8)](../man/runc-restore.8.md). + +## Checkpoint/Restore Annotations ## + +In addition to specifying options on the command-line like it is described +in the man-pages (see above), it is also possible to influence CRIU's +behaviour using CRIU configuration files. For details about CRIU's +configuration file support please see [CRIU's wiki](https://criu.org/Configuration_files). + +In addition to CRIU's default configuration files `runc` tells CRIU to +also evaluate the file `/etc/criu/runc.conf`. Using the annotation +`org.criu.config` it is, however, possible to change this additional +CRIU configuration file. + +If the annotation `org.criu.config` is set to an empty string `runc` +will not pass any additional configuration file to CRIU. With an empty +string it is therefore possible to disable the additional CRIU configuration +file. This can be used to make sure that no additional configuration file +changes CRIU's behaviour accidentally. + +If the annotation `org.criu.config` is set to a non-empty string `runc` will +pass that string to CRIU to be evaluated as an additional configuration file. +If CRIU cannot open this additional configuration file, it will ignore this +file and continue. + +### Annotation Example to disable additional CRIU configuration file ### + +``` +{ + "ociVersion": "1.0.0", + "annotations": { + "org.criu.config": "" + }, + "process": { +``` + +### Annotation Example to set a specific CRIU configuration file ### + +``` +{ + "ociVersion": "1.0.0", + "annotations": { + "org.criu.config": "/etc/special-runc-criu-options" + }, + "process": { +``` diff --git a/docs/terminals.md b/docs/terminals.md new file mode 100644 index 0000000..fc000e1 --- /dev/null +++ b/docs/terminals.md @@ -0,0 +1,314 @@ +# Terminals and Standard IO # + +*Note that the default configuration of `runc` (foreground, new terminal) is +generally the best option for most users. This document exists to help explain +what the purpose of the different modes is, and to try to steer users away from +common mistakes and misunderstandings.* + +In general, most processes on Unix (and Unix-like) operating systems have 3 +standard file descriptors provided at the start, collectively referred to as +"standard IO" (`stdio`): + +* `0`: standard-in (`stdin`), the input stream into the process +* `1`: standard-out (`stdout`), the output stream from the process +* `2`: standard-error (`stderr`), the error stream from the process + +When creating and running a container via `runc`, it is important to take care +to structure the `stdio` the new container's process receives. In some ways +containers are just regular processes, while in other ways they're an isolated +sub-partition of your machine (in a similar sense to a VM). This means that the +structure of IO is not as simple as with ordinary programs (which generally +just use the file descriptors you give them). + +## Other File Descriptors ## + +Before we continue, it is important to note that processes can have more file +descriptors than just `stdio`. By default in `runc` no other file descriptors +will be passed to the spawned container process. If you wish to explicitly pass +file descriptors to the container you have to use the `--preserve-fds` option. +These ancillary file descriptors don't have any of the strange semantics +discussed further in this document (those only apply to `stdio`) -- they are +passed untouched by `runc`. + +It should be noted that `--preserve-fds` does not take individual file +descriptors to preserve. Instead, it takes how many file descriptors (not +including `stdio` or `LISTEN_FDS`) should be passed to the container. In the +following example: + +``` +% runc run --preserve-fds 5 +``` + +`runc` will pass the first `5` file descriptors (`3`, `4`, `5`, `6`, and `7` -- +assuming that `LISTEN_FDS` has not been configured) to the container. + +In addition to `--preserve-fds`, `LISTEN_FDS` file descriptors are passed +automatically to allow for `systemd`-style socket activation. To extend the +above example: + +``` +% LISTEN_PID=$pid_of_runc LISTEN_FDS=3 runc run --preserve-fds 5 +``` + +`runc` will now pass the first `8` file descriptors (and it will also pass +`LISTEN_FDS=3` and `LISTEN_PID=1` to the container). The first `3` (`3`, `4`, +and `5`) were passed due to `LISTEN_FDS` and the other `5` (`6`, `7`, `8`, `9`, +and `10`) were passed due to `--preserve-fds`. You should keep this in mind if +you use `runc` directly in something like a `systemd` unit file. To disable +this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`. + +**Be very careful when passing file descriptors to a container process.** Due +to some Linux kernel (mis)features, a container with access to certain types of +file descriptors (such as `O_PATH` descriptors) outside of the container's root +file system can use these to break out of the container's pivoted mount +namespace. [This has resulted in CVEs in the past.][CVE-2016-9962] + +[CVE-2016-9962]: https://nvd.nist.gov/vuln/detail/CVE-2016-9962 + +## Terminal Modes ## + +`runc` supports two distinct methods for passing `stdio` to the container's +primary process: + +* [new terminal](#new-terminal) (`terminal: true`) +* [pass-through](#pass-through) (`terminal: false`) + +When first using `runc` these two modes will look incredibly similar, but this +can be quite deceptive as these different modes have quite different +characteristics. + +By default, `runc spec` will create a configuration that will create a new +terminal (`terminal: true`). However, if the `terminal: ...` line is not +present in `config.json` then pass-through is the default. + +*In general we recommend using new terminal, because it means that tools like +`sudo` will work inside your container. But pass-through can be useful if you +know what you're doing, or if you're using `runc` as part of a non-interactive +pipeline.* + +### New Terminal ### + +In new terminal mode, `runc` will create a brand-new "console" (or more +precisely, a new pseudo-terminal using the container's namespaced +`/dev/pts/ptmx`) for your contained process to use as its `stdio`. + +When you start a process in new terminal mode, `runc` will do the following: + +1. Create a new pseudo-terminal. +2. Pass the slave end to the container's primary process as its `stdio`. +3. Send the master end to a process to interact with the `stdio` for the + container's primary process ([details below](#runc-modes)). + +It should be noted that since a new pseudo-terminal is being used for +communication with the container, some strange properties of pseudo-terminals +might surprise you. For instance, by default, all new pseudo-terminals +translate the byte `'\n'` to the sequence `'\r\n'` on both `stdout` and +`stderr`. In addition there are [a whole range of `ioctls(2)` that can only +interact with pseudo-terminal `stdio`][tty_ioctl(4)]. + +> **NOTE**: In new terminal mode, all three `stdio` file descriptors are the +> same underlying file. The reason for this is to match how a shell's `stdio` +> looks to a process (as well as remove race condition issues with having to +> deal with multiple master pseudo-terminal file descriptors). However this +> means that it is not really possible to uniquely distinguish between `stdout` +> and `stderr` from the caller's perspective. + +[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl + +### Pass-Through ### + +If you have already set up some file handles that you wish your contained +process to use as its `stdio`, then you can ask `runc` to pass them through to +the contained process (this is not necessarily the same as `--preserve-fds`'s +passing of file descriptors -- [details below](#runc-modes)). As an example +(assuming that `terminal: false` is set in `config.json`): + +``` +% echo input | runc run some_container > /tmp/log.out 2>& /tmp/log.err +``` + +Here the container's various `stdio` file descriptors will be substituted with +the following: + +* `stdin` will be sourced from the `echo input` pipeline. +* `stdout` will be output into `/tmp/log.out` on the host. +* `stderr` will be output into `/tmp/log.err` on the host. + +It should be noted that the actual file handles seen inside the container may +be different [based on the mode `runc` is being used in](#runc-modes) (for +instance, the file referenced by `1` could be `/tmp/log.out` directly or a pipe +which `runc` is using to buffer output, based on the mode). However the net +result will be the same in either case. In principle you could use the [new +terminal mode](#new-terminal) in a pipeline, but the difference will become +more clear when you are introduced to [`runc`'s detached mode](#runc-modes). + +## `runc` Modes ## + +`runc` itself runs in two modes: + +* [foreground](#foreground) +* [detached](#detached) + +You can use either [terminal mode](#terminal-modes) with either `runc` mode. +However, there are considerations that may indicate preference for one mode +over another. It should be noted that while two types of modes (terminal and +`runc`) are conceptually independent from each other, you should be aware of +the intricacies of which combination you are using. + +*In general we recommend using foreground because it's the most +straight-forward to use, with the only downside being that you will have a +long-running `runc` process. Detached mode is difficult to get right and +generally requires having your own `stdio` management.* + +### Foreground ### + +The default (and most straight-forward) mode of `runc`. In this mode, your +`runc` command remains in the foreground with the container process as a child. +All `stdio` is buffered through the foreground `runc` process (irrespective of +which terminal mode you are using). This is conceptually quite similar to +running a normal process interactively in a shell (and if you are using `runc` +in a shell interactively, this is what you should use). + +Because the `stdio` will be buffered in this mode, some very important +peculiarities of this mode should be kept in mind: + +* With [new terminal mode](#new-terminal), the container will see a + pseudo-terminal as its `stdio` (as you might expect). However, the `stdio` of + the foreground `runc` process will remain the `stdio` that the process was + started with -- and `runc` will copy all `stdio` between its `stdio` and the + container's `stdio`. This means that while a new pseudo-terminal has been + created, the foreground `runc` process manages it over the lifetime of the + container. + +* With [pass-through mode](#pass-through), the foreground `runc`'s `stdio` is + **not** passed to the container. Instead, the container's `stdio` is a set of + pipes which are used to copy data between `runc`'s `stdio` and the + container's `stdio`. This means that the container never has direct access to + host file descriptors (aside from the pipes created by the container runtime, + but that shouldn't be an issue). + +The main drawback of the foreground mode of operation is that it requires a +long-running foreground `runc` process. If you kill the foreground `runc` +process then you will no longer have access to the `stdio` of the container +(and in most cases this will result in the container dying abnormally due to +`SIGPIPE` or some other error). By extension this means that any bug in the +long-running foreground `runc` process (such as a memory leak) or a stray +OOM-kill sweep could result in your container being killed **through no fault +of the user**. In addition, there is no way in foreground mode of passing a +file descriptor directly to the container process as its `stdio` (like +`--preserve-fds` does). + +These shortcomings are obviously sub-optimal and are the reason that `runc` has +an additional mode called "detached mode". + +### Detached ### + +In contrast to foreground mode, in detached mode there is no long-running +foreground `runc` process once the container has started. In fact, there is no +long-running `runc` process at all. However, this means that it is up to the +caller to handle the `stdio` after `runc` has set it up for you. In a shell +this means that the `runc` command will exit and control will return to the +shell, after the container has been set up. + +You can run `runc` in detached mode in one of the following ways: + +* `runc run -d ...` which operates similar to `runc run` but is detached. +* `runc create` followed by `runc start` which is the standard container + lifecycle defined by the OCI runtime specification (`runc create` sets up the + container completely, waiting for `runc start` to begin execution of user + code). + +The main use-case of detached mode is for higher-level tools that want to be +wrappers around `runc`. By running `runc` in detached mode, those tools have +far more control over the container's `stdio` without `runc` getting in the +way (most wrappers around `runc` like `cri-o` or `containerd` use detached mode +for this reason). + +Unfortunately using detached mode is a bit more complicated and requires more +care than the foreground mode -- mainly because it is now up to the caller to +handle the `stdio` of the container. + +#### Detached Pass-Through #### + +In detached mode, pass-through actually does what it says on the tin -- the +`stdio` file descriptors of the `runc` process are passed through (untouched) +to the container's `stdio`. The purpose of this option is to allow a user to +set up `stdio` for a container themselves and then force `runc` to just use +their pre-prepared `stdio` (without any pseudo-terminal funny business). *If +you don't see why this would be useful, don't use this option.* + +**You must be incredibly careful when using detached pass-through (especially +in a shell).** The reason for this is that by using detached pass-through you +are passing host file descriptors to the container. In the case of a shell, +usually your `stdio` is going to be a pseudo-terminal (on your host). A +malicious container could take advantage of TTY-specific `ioctls` like +`TIOCSTI` to fake input into the **host** shell (remember that in detached +mode, control is returned to your shell and so the terminal you've given the +container is being read by a shell prompt). + +There are also several other issues with running non-malicious containers in a +shell with detached pass-through (where you pass your shell's `stdio` to the +container): + +* Output from the container will be interleaved with output from your shell (in + a non-deterministic way), without any real way of distinguishing from where a + particular piece of output came from. + +* Any input to `stdin` will be non-deterministically split and given to either + the container or the shell (because both are blocked on a `read(2)` of the + same FIFO-style file descriptor). + +They are all related to the fact that there is going to be a race when either +your host or the container tries to read from (or write to) `stdio`. This +problem is especially obvious when in a shell, where usually the terminal has +been put into raw mode (where each individual key-press should cause `read(2)` +to return). + +> **NOTE**: There is also currently a [known problem][issue-1721] where using +> detached pass-through will result in the container hanging if the `stdout` or +> `stderr` is a pipe (though this should be a temporary issue). + +[issue-1721]: https://github.com/opencontainers/runc/issues/1721 + +#### Detached New Terminal #### + +When creating a new pseudo-terminal in detached mode, and fairly obvious +problem appears -- how do we use the new terminal that `runc` created? Unlike +in pass-through, `runc` has created a new set of file descriptors that need to +be used by *something* in order for container communication to work. + +The way this problem is resolved is through the use of Unix domain sockets. +There is a feature of Unix sockets called `SCM_RIGHTS` which allows a file +descriptor to be sent through a Unix socket to a completely separate process +(which can then use that file descriptor as though they opened it). When using +`runc` in detached new terminal mode, this is how a user gets access to the +pseudo-terminal's master file descriptor. + +To this end, there is a new option (which is required if you want to use `runc` +in detached new terminal mode): `--console-socket`. This option takes the path +to a Unix domain socket which `runc` will connect to and send the +pseudo-terminal master file descriptor down. The general process for getting +the pseudo-terminal master is as follows: + +1. Create a Unix domain socket at some path, `$socket_path`. +2. Call `runc run` or `runc create` with the argument `--console-socket + $socket_path`. +3. Using `recvmsg(2)` retrieve the file descriptor sent using `SCM_RIGHTS` by + `runc`. +4. Now the manager can interact with the `stdio` of the container, using the + retrieved pseudo-terminal master. + +After `runc` exits, the only process with a copy of the pseudo-terminal master +file descriptor is whoever read the file descriptor from the socket. + +> **NOTE**: Currently `runc` doesn't support abstract socket addresses (due to +> it not being possible to pass an `argv` with a null-byte as the first +> character). In the future this may change, but currently you must use a valid +> path name. + +In order to help users make use of detached new terminal mode, we have provided +a [Go implementation in the `go-runc` bindings][containerd/go-runc.Socket], as +well as [a simple client][recvtty]. + +[containerd/go-runc.Socket]: https://godoc.org/github.com/containerd/go-runc#Socket +[recvtty]: /contrib/cmd/recvtty diff --git a/events.go b/events.go new file mode 100644 index 0000000..fb3f630 --- /dev/null +++ b/events.go @@ -0,0 +1,215 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "os" + "sync" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/types" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var eventsCommand = cli.Command{ + Name: "events", + Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics", + ArgsUsage: ` + +Where "" is the name for the instance of the container.`, + Description: `The events command displays information about the container. By default the +information is displayed once every 5 seconds.`, + Flags: []cli.Flag{ + cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"}, + cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"}, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + duration := context.Duration("interval") + if duration <= 0 { + return fmt.Errorf("duration interval must be greater than 0") + } + status, err := container.Status() + if err != nil { + return err + } + if status == libcontainer.Stopped { + return fmt.Errorf("container with id %s is not running", container.ID()) + } + var ( + stats = make(chan *libcontainer.Stats, 1) + events = make(chan *types.Event, 1024) + group = &sync.WaitGroup{} + ) + group.Add(1) + go func() { + defer group.Done() + enc := json.NewEncoder(os.Stdout) + for e := range events { + if err := enc.Encode(e); err != nil { + logrus.Error(err) + } + } + }() + if context.Bool("stats") { + s, err := container.Stats() + if err != nil { + return err + } + events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)} + close(events) + group.Wait() + return nil + } + go func() { + for range time.Tick(context.Duration("interval")) { + s, err := container.Stats() + if err != nil { + logrus.Error(err) + continue + } + stats <- s + } + }() + n, err := container.NotifyOOM() + if err != nil { + return err + } + for { + select { + case _, ok := <-n: + if ok { + // this means an oom event was received, if it is !ok then + // the channel was closed because the container stopped and + // the cgroups no longer exist. + events <- &types.Event{Type: "oom", ID: container.ID()} + } else { + n = nil + } + case s := <-stats: + events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)} + } + if n == nil { + close(events) + break + } + } + group.Wait() + return nil + }, +} + +func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats { + cg := ls.CgroupStats + if cg == nil { + return nil + } + var s types.Stats + s.Pids.Current = cg.PidsStats.Current + s.Pids.Limit = cg.PidsStats.Limit + + s.CPU.Usage.Kernel = cg.CpuStats.CpuUsage.UsageInKernelmode + s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode + s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage + s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage + s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods + s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods + s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime + + s.Memory.Cache = cg.MemoryStats.Cache + s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage) + s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage) + s.Memory.Swap = convertMemoryEntry(cg.MemoryStats.SwapUsage) + s.Memory.Usage = convertMemoryEntry(cg.MemoryStats.Usage) + s.Memory.Raw = cg.MemoryStats.Stats + + s.Blkio.IoServiceBytesRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceBytesRecursive) + s.Blkio.IoServicedRecursive = convertBlkioEntry(cg.BlkioStats.IoServicedRecursive) + s.Blkio.IoQueuedRecursive = convertBlkioEntry(cg.BlkioStats.IoQueuedRecursive) + s.Blkio.IoServiceTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceTimeRecursive) + s.Blkio.IoWaitTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoWaitTimeRecursive) + s.Blkio.IoMergedRecursive = convertBlkioEntry(cg.BlkioStats.IoMergedRecursive) + s.Blkio.IoTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoTimeRecursive) + s.Blkio.SectorsRecursive = convertBlkioEntry(cg.BlkioStats.SectorsRecursive) + + s.Hugetlb = make(map[string]types.Hugetlb) + for k, v := range cg.HugetlbStats { + s.Hugetlb[k] = convertHugtlb(v) + } + + if is := ls.IntelRdtStats; is != nil { + if intelrdt.IsCatEnabled() { + s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo) + s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot + s.IntelRdt.L3CacheSchema = is.L3CacheSchema + } + if intelrdt.IsMbaEnabled() { + s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo) + s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot + s.IntelRdt.MemBwSchema = is.MemBwSchema + } + } + + s.NetworkInterfaces = ls.Interfaces + return &s +} + +func convertHugtlb(c cgroups.HugetlbStats) types.Hugetlb { + return types.Hugetlb{ + Usage: c.Usage, + Max: c.MaxUsage, + Failcnt: c.Failcnt, + } +} + +func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry { + return types.MemoryEntry{ + Limit: c.Limit, + Usage: c.Usage, + Max: c.MaxUsage, + Failcnt: c.Failcnt, + } +} + +func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry { + var out []types.BlkioEntry + for _, e := range c { + out = append(out, types.BlkioEntry{ + Major: e.Major, + Minor: e.Minor, + Op: e.Op, + Value: e.Value, + }) + } + return out +} + +func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo { + return &types.L3CacheInfo{ + CbmMask: i.CbmMask, + MinCbmBits: i.MinCbmBits, + NumClosids: i.NumClosids, + } +} + +func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo { + return &types.MemBwInfo{ + BandwidthGran: i.BandwidthGran, + DelayLinear: i.DelayLinear, + MinBandwidth: i.MinBandwidth, + NumClosids: i.NumClosids, + } +} diff --git a/exec.go b/exec.go new file mode 100644 index 0000000..b963d68 --- /dev/null +++ b/exec.go @@ -0,0 +1,235 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var execCommand = cli.Command{ + Name: "exec", + Usage: "execute new process inside the container", + ArgsUsage: ` [command options] || -p process.json + +Where "" is the name for the instance of the container and +"" is the command to be executed in the container. +"" can't be empty unless a "-p" flag provided. + +EXAMPLE: +For example, if the container is configured to run the linux ps command the +following will output a list of processes running in the container: + + # runc exec ps`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "console-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "cwd", + Usage: "current working directory in the container", + }, + cli.StringSliceFlag{ + Name: "env, e", + Usage: "set environment variables", + }, + cli.BoolFlag{ + Name: "tty, t", + Usage: "allocate a pseudo-TTY", + }, + cli.StringFlag{ + Name: "user, u", + Usage: "UID (format: [:])", + }, + cli.Int64SliceFlag{ + Name: "additional-gids, g", + Usage: "additional gids", + }, + cli.StringFlag{ + Name: "process, p", + Usage: "path to the process.json", + }, + cli.BoolFlag{ + Name: "detach,d", + Usage: "detach from the container's process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.StringFlag{ + Name: "process-label", + Usage: "set the asm process label for the process commonly used with selinux", + }, + cli.StringFlag{ + Name: "apparmor", + Usage: "set the apparmor profile for the process", + }, + cli.BoolFlag{ + Name: "no-new-privs", + Usage: "set the no new privileges value for the process", + }, + cli.StringSliceFlag{ + Name: "cap, c", + Value: &cli.StringSlice{}, + Usage: "add a capability to the bounding set for the process", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + Hidden: true, + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + if err := revisePidFile(context); err != nil { + return err + } + status, err := execProcess(context) + if err == nil { + os.Exit(status) + } + return fmt.Errorf("exec failed: %v", err) + }, + SkipArgReorder: true, +} + +func execProcess(context *cli.Context) (int, error) { + container, err := getContainer(context) + if err != nil { + return -1, err + } + status, err := container.Status() + if err != nil { + return -1, err + } + if status == libcontainer.Stopped { + return -1, fmt.Errorf("cannot exec a container that has stopped") + } + path := context.String("process") + if path == "" && len(context.Args()) == 1 { + return -1, fmt.Errorf("process args cannot be empty") + } + detach := context.Bool("detach") + state, err := container.State() + if err != nil { + return -1, err + } + bundle := utils.SearchLabels(state.Config.Labels, "bundle") + p, err := getProcess(context, bundle) + if err != nil { + return -1, err + } + + logLevel := "info" + if context.GlobalBool("debug") { + logLevel = "debug" + } + + r := &runner{ + enableSubreaper: false, + shouldDestroy: false, + container: container, + consoleSocket: context.String("console-socket"), + detach: detach, + pidFile: context.String("pid-file"), + action: CT_ACT_RUN, + init: false, + preserveFDs: context.Int("preserve-fds"), + logLevel: logLevel, + } + return r.run(p) +} + +func getProcess(context *cli.Context, bundle string) (*specs.Process, error) { + if path := context.String("process"); path != "" { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var p specs.Process + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, err + } + return &p, validateProcessSpec(&p) + } + // process via cli flags + if err := os.Chdir(bundle); err != nil { + return nil, err + } + spec, err := loadSpec(specConfig) + if err != nil { + return nil, err + } + p := spec.Process + p.Args = context.Args()[1:] + // override the cwd, if passed + if context.String("cwd") != "" { + p.Cwd = context.String("cwd") + } + if ap := context.String("apparmor"); ap != "" { + p.ApparmorProfile = ap + } + if l := context.String("process-label"); l != "" { + p.SelinuxLabel = l + } + if caps := context.StringSlice("cap"); len(caps) > 0 { + for _, c := range caps { + p.Capabilities.Bounding = append(p.Capabilities.Bounding, c) + p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, c) + p.Capabilities.Effective = append(p.Capabilities.Effective, c) + p.Capabilities.Permitted = append(p.Capabilities.Permitted, c) + p.Capabilities.Ambient = append(p.Capabilities.Ambient, c) + } + } + // append the passed env variables + p.Env = append(p.Env, context.StringSlice("env")...) + + // set the tty + if context.IsSet("tty") { + p.Terminal = context.Bool("tty") + } + if context.IsSet("no-new-privs") { + p.NoNewPrivileges = context.Bool("no-new-privs") + } + // override the user, if passed + if context.String("user") != "" { + u := strings.SplitN(context.String("user"), ":", 2) + if len(u) > 1 { + gid, err := strconv.Atoi(u[1]) + if err != nil { + return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err) + } + p.User.GID = uint32(gid) + } + uid, err := strconv.Atoi(u[0]) + if err != nil { + return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err) + } + p.User.UID = uint32(uid) + } + for _, gid := range context.Int64Slice("additional-gids") { + if gid < 0 { + return nil, fmt.Errorf("additional-gids must be a positive number %d", gid) + } + p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid)) + } + return p, validateProcessSpec(p) +} diff --git a/init.go b/init.go new file mode 100644 index 0000000..08351fd --- /dev/null +++ b/init.go @@ -0,0 +1,50 @@ +package main + +import ( + "fmt" + "os" + "runtime" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/logs" + _ "github.com/opencontainers/runc/libcontainer/nsenter" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + + level := os.Getenv("_LIBCONTAINER_LOGLEVEL") + logLevel, err := logrus.ParseLevel(level) + if err != nil { + panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err)) + } + + err = logs.ConfigureLogging(logs.Config{ + LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"), + LogFormat: "json", + LogLevel: logLevel, + }) + if err != nil { + panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err)) + } + logrus.Debug("child process in init()") + } +} + +var initCommand = cli.Command{ + Name: "init", + Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`, + Action: func(context *cli.Context) error { + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + // as the error is sent back to the parent there is no need to log + // or write it to stderr because the parent process will handle this + os.Exit(1) + } + panic("libcontainer: container init failed to exec") + }, +} diff --git a/kill.go b/kill.go new file mode 100644 index 0000000..c2d7929 --- /dev/null +++ b/kill.go @@ -0,0 +1,68 @@ +// +build linux + +package main + +import ( + "fmt" + "strconv" + "strings" + "syscall" + + "github.com/urfave/cli" +) + +var killCommand = cli.Command{ + Name: "kill", + Usage: "kill sends the specified signal (default: SIGTERM) to the container's init process", + ArgsUsage: ` [signal] + +Where "" is the name for the instance of the container and +"[signal]" is the signal to be sent to the init process. + +EXAMPLE: +For example, if the container id is "ubuntu01" the following will send a "KILL" +signal to the init process of the "ubuntu01" container: + + # runc kill ubuntu01 KILL`, + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "all, a", + Usage: "send the specified signal to all processes inside the container", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + if err := checkArgs(context, 2, maxArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + + sigstr := context.Args().Get(1) + if sigstr == "" { + sigstr = "SIGTERM" + } + + signal, err := parseSignal(sigstr) + if err != nil { + return err + } + return container.Signal(signal, context.Bool("all")) + }, +} + +func parseSignal(rawSignal string) (syscall.Signal, error) { + s, err := strconv.Atoi(rawSignal) + if err == nil { + return syscall.Signal(s), nil + } + signal, ok := signalMap[strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")] + if !ok { + return -1, fmt.Errorf("unknown signal %q", rawSignal) + } + return signal, nil +} diff --git a/libcontainer/README.md b/libcontainer/README.md new file mode 100644 index 0000000..a791ca2 --- /dev/null +++ b/libcontainer/README.md @@ -0,0 +1,331 @@ +# libcontainer + +[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer) + +Libcontainer provides a native Go implementation for creating containers +with namespaces, cgroups, capabilities, and filesystem access controls. +It allows you to manage the lifecycle of the container performing additional operations +after the container is created. + + +#### Container +A container is a self contained execution environment that shares the kernel of the +host system and which is (optionally) isolated from other containers in the system. + +#### Using libcontainer + +Because containers are spawned in a two step process you will need a binary that +will be executed as the init process for the container. In libcontainer, we use +the current binary (/proc/self/exe) to be executed as the init process, and use +arg "init", we call the first step process "bootstrap", so you always need a "init" +function as the entry of "bootstrap". + +In addition to the go init function the early stage bootstrap is handled by importing +[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md). + +```go +import ( + _ "github.com/opencontainers/runc/libcontainer/nsenter" +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } + panic("--this line should have never been executed, congratulations--") + } +} +``` + +Then to create a container you first have to initialize an instance of a factory +that will handle the creation and initialization for a container. + +```go +factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init")) +if err != nil { + logrus.Fatal(err) + return +} +``` + +Once you have an instance of the factory created we can create a configuration +struct describing how the container is to be created. A sample would look similar to this: + +```go +defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV +config := &configs.Config{ + Rootfs: "/your/path/to/rootfs", + Capabilities: &configs.Capabilities{ + Bounding: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Effective: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Inheritable: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Permitted: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Ambient: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWUSER}, + {Type: configs.NEWNET}, + {Type: configs.NEWCGROUP}, + }), + Cgroups: &configs.Cgroup{ + Name: "test-container", + Parent: "system", + Resources: &configs.Resources{ + MemorySwappiness: nil, + AllowAllDevices: nil, + AllowedDevices: configs.DefaultAllowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: configs.DefaultAutoCreatedDevices, + Hostname: "testing", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: unix.MS_NOSUID | unix.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: unix.MS_NOSUID | unix.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | unix.MS_RDONLY, + }, + }, + UidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + GidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: unix.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, +} +``` + +Once you have the configuration populated you can create a container: + +```go +container, err := factory.Create("container-id", config) +if err != nil { + logrus.Fatal(err) + return +} +``` + +To spawn bash as the initial process inside the container and have the +processes pid returned in order to wait, signal, or kill the process: + +```go +process := &libcontainer.Process{ + Args: []string{"/bin/bash"}, + Env: []string{"PATH=/bin"}, + User: "daemon", + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, + Init: true, +} + +err := container.Run(process) +if err != nil { + container.Destroy() + logrus.Fatal(err) + return +} + +// wait for the process to finish. +_, err := process.Wait() +if err != nil { + logrus.Fatal(err) +} + +// destroy the container. +container.Destroy() +``` + +Additional ways to interact with a running container are: + +```go +// return all the pids for all processes running inside the container. +processes, err := container.Processes() + +// get detailed cpu, memory, io, and network statistics for the container and +// it's processes. +stats, err := container.Stats() + +// pause all processes inside the container. +container.Pause() + +// resume all paused processes. +container.Resume() + +// send signal to container's init process. +container.Signal(signal) + +// update container resource constraints. +container.Set(config) + +// get current status of the container. +status, err := container.Status() + +// get current container's state information. +state, err := container.State() +``` + + +#### Checkpoint & Restore + +libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. +This let's you save the state of a process running inside a container to disk, and then restore +that state into a new process, on the same machine or on another machine. + +`criu` version 1.5.2 or higher is required to use checkpoint and restore. +If you don't already have `criu` installed, you can build it from source, following the +[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image +generated when building libcontainer with docker. + + +## Copyright and license + +Code and documentation copyright 2014 Docker, inc. +The code and documentation are released under the [Apache 2.0 license](../LICENSE). +The documentation is also released under Creative Commons Attribution 4.0 International License. +You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/. diff --git a/libcontainer/SPEC.md b/libcontainer/SPEC.md new file mode 100644 index 0000000..07ebdc1 --- /dev/null +++ b/libcontainer/SPEC.md @@ -0,0 +1,465 @@ +## Container Specification - v1 + +This is the standard configuration for version 1 containers. It includes +namespaces, standard filesystem setup, a default Linux capability set, and +information about resource reservations. It also has information about any +populated environment settings for the processes running inside a container. + +Along with the configuration of how a container is created the standard also +discusses actions that can be performed on a container to manage and inspect +information about the processes running inside. + +The v1 profile is meant to be able to accommodate the majority of applications +with a strong security configuration. + +### System Requirements and Compatibility + +Minimum requirements: +* Kernel version - 3.10 recommended 2.6.2x minimum(with backported patches) +* Mounted cgroups with each subsystem in its own hierarchy + + +### Namespaces + +| Flag | Enabled | +| --------------- | ------- | +| CLONE_NEWPID | 1 | +| CLONE_NEWUTS | 1 | +| CLONE_NEWIPC | 1 | +| CLONE_NEWNET | 1 | +| CLONE_NEWNS | 1 | +| CLONE_NEWUSER | 1 | +| CLONE_NEWCGROUP | 1 | + +Namespaces are created for the container via the `unshare` syscall. + + +### Filesystem + +A root filesystem must be provided to a container for execution. The container +will use this root filesystem (rootfs) to jail and spawn processes inside where +the binaries and system libraries are local to that directory. Any binaries +to be executed must be contained within this rootfs. + +Mounts that happen inside the container are automatically cleaned up when the +container exits as the mount namespace is destroyed and the kernel will +unmount all the mounts that were setup within that namespace. + +For a container to execute properly there are certain filesystems that +are required to be mounted within the rootfs that the runtime will setup. + +| Path | Type | Flags | Data | +| ----------- | ------ | -------------------------------------- | ---------------------------------------- | +| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 | +| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k | +| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 | +| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | | + + +After a container's filesystems are mounted within the newly created +mount namespace `/dev` will need to be populated with a set of device nodes. +It is expected that a rootfs does not need to have any device nodes specified +for `/dev` within the rootfs as the container will setup the correct devices +that are required for executing a container's process. + +| Path | Mode | Access | +| ------------ | ---- | ---------- | +| /dev/null | 0666 | rwm | +| /dev/zero | 0666 | rwm | +| /dev/full | 0666 | rwm | +| /dev/tty | 0666 | rwm | +| /dev/random | 0666 | rwm | +| /dev/urandom | 0666 | rwm | + + +**ptmx** +`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within +the container. + +The use of a pseudo TTY is optional within a container and it should support both. +If a pseudo is provided to the container `/dev/console` will need to be +setup by binding the console in `/dev/` after it has been populated and mounted +in tmpfs. + +| Source | Destination | UID GID | Mode | Type | +| --------------- | ------------ | ------- | ---- | ---- | +| *pty host path* | /dev/console | 0 0 | 0600 | bind | + + +After `/dev/null` has been setup we check for any external links between +the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing +to `/dev/null` outside the container we close and `dup2` the `/dev/null` +that is local to the container's rootfs. + + +After the container has `/proc` mounted a few standard symlinks are setup +within `/dev/` for the io. + +| Source | Destination | +| --------------- | ----------- | +| /proc/self/fd | /dev/fd | +| /proc/self/fd/0 | /dev/stdin | +| /proc/self/fd/1 | /dev/stdout | +| /proc/self/fd/2 | /dev/stderr | + +A `pivot_root` is used to change the root for the process, effectively +jailing the process inside the rootfs. + +```c +put_old = mkdir(...); +pivot_root(rootfs, put_old); +chdir("/"); +unmount(put_old, MS_DETACH); +rmdir(put_old); +``` + +For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined +with a `chroot` is required as `pivot_root` is not supported in `ramfs`. + +```c +mount(rootfs, "/", NULL, MS_MOVE, NULL); +chroot("."); +chdir("/"); +``` + +The `umask` is set back to `0022` after the filesystem setup has been completed. + +### Resources + +Cgroups are used to handle resource allocation for containers. This includes +system resources like cpu, memory, and device access. + +| Subsystem | Enabled | +| ---------- | ------- | +| devices | 1 | +| memory | 1 | +| cpu | 1 | +| cpuacct | 1 | +| cpuset | 1 | +| blkio | 1 | +| perf_event | 1 | +| freezer | 1 | +| hugetlb | 1 | +| pids | 1 | + + +All cgroup subsystem are joined so that statistics can be collected from +each of the subsystems. Freezer does not expose any stats but is joined +so that containers can be paused and resumed. + +The parent process of the container's init must place the init pid inside +the correct cgroups before the initialization begins. This is done so +that no processes or threads escape the cgroups. This sync is +done via a pipe ( specified in the runtime section below ) that the container's +init process will block waiting for the parent to finish setup. + +### IntelRdt + +Intel platforms with new Xeon CPU support Resource Director Technology (RDT). +Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are +two sub-features of RDT. + +Cache Allocation Technology (CAT) provides a way for the software to restrict +cache allocation to a defined 'subset' of L3 cache which may be overlapping +with other 'subsets'. The different subsets are identified by class of +service (CLOS) and each CLOS has a capacity bitmask (CBM). + +Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle +over memory bandwidth for the software. A user controls the resource by +indicating the percentage of maximum memory bandwidth or memory bandwidth limit +in MBps unit if MBA Software Controller is enabled. + +It can be used to handle L3 cache and memory bandwidth resources allocation +for containers if hardware and kernel support Intel RDT CAT and MBA features. + +In Linux 4.10 kernel or newer, the interface is defined and exposed via +"resource control" filesystem, which is a "cgroup-like" interface. + +Comparing with cgroups, it has similar process management lifecycle and +interfaces in a container. But unlike cgroups' hierarchy, it has single level +filesystem layout. + +CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via +"resource control" filesystem. + +Intel RDT "resource control" filesystem hierarchy: +``` +mount -t resctrl resctrl /sys/fs/resctrl +tree /sys/fs/resctrl +/sys/fs/resctrl/ +|-- info +| |-- L3 +| | |-- cbm_mask +| | |-- min_cbm_bits +| | |-- num_closids +| |-- MB +| |-- bandwidth_gran +| |-- delay_linear +| |-- min_bandwidth +| |-- num_closids +|-- ... +|-- schemata +|-- tasks +|-- + |-- ... + |-- schemata + |-- tasks +``` + +For runc, we can make use of `tasks` and `schemata` configuration for L3 +cache and memory bandwidth resources constraints. + +The file `tasks` has a list of tasks that belongs to this group (e.g., +" group). Tasks can be added to a group by writing the task ID +to the "tasks" file (which will automatically remove them from the previous +group to which they belonged). New tasks created by fork(2) and clone(2) are +added to the same group as their parent. + +The file `schemata` has a list of all the resources available to this group. +Each resource (L3 cache, memory bandwidth) has its own line and format. + +L3 cache schema: +It has allocation bitmasks/values for L3 cache on each socket, which +contains L3 cache id and capacity bitmask (CBM). +``` + Format: "L3:=;=;..." +``` +For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" +which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + +The valid L3 cache CBM is a *contiguous bits set* and number of bits that can +be set is less than the max bit. The max bits in the CBM is varied among +supported Intel CPU models. Kernel will check if it is valid when writing. +e.g., default value 0xfffff in root indicates the max bits of CBM is 20 +bits, which mapping to entire L3 cache capacity. Some valid CBM values to +set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + +Memory bandwidth schema: +It has allocation values for memory bandwidth on each socket, which contains +L3 cache id and memory bandwidth. +``` + Format: "MB:=bandwidth0;=bandwidth1;..." +``` +For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + +The minimum bandwidth percentage value for each CPU model is predefined and +can be looked up through "info/MB/min_bandwidth". The bandwidth granularity +that is allocated is also dependent on the CPU model and can be looked up at +"info/MB/bandwidth_gran". The available bandwidth control steps are: +min_bw + N * bw_gran. Intermediate values are rounded to the next control +step available on the hardware. + +If MBA Software Controller is enabled through mount option "-o mba_MBps" +mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl +We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit +instead of "percentages". The kernel underneath would use a software feedback +mechanism or a "Software Controller" which reads the actual bandwidth using +MBM counters and adjust the memory bandwidth percentages to ensure: +"actual memory bandwidth < user specified memory bandwidth". + +For example, on a two-socket machine, the schema line could be +"MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 +and 7000 MBps memory bandwidth limit on socket 1. + +For more information about Intel RDT kernel interface: +https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + +``` +An example for runc: +Consider a two-socket machine with two L3 caches where the default CBM is +0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% +with a memory bandwidth granularity of 10%. + +Tasks inside the container only have access to the "upper" 7/11 of L3 cache +on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a +maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. + +"linux": { + "intelRdt": { + "closID": "guaranteed_group", + "l3CacheSchema": "L3:0=7f0;1=1f", + "memBwSchema": "MB:0=20;1=70" + } +} +``` + +### Security + +The standard set of Linux capabilities that are set in a container +provide a good default for security and flexibility for the applications. + + +| Capability | Enabled | +| -------------------- | ------- | +| CAP_NET_RAW | 1 | +| CAP_NET_BIND_SERVICE | 1 | +| CAP_AUDIT_READ | 1 | +| CAP_AUDIT_WRITE | 1 | +| CAP_DAC_OVERRIDE | 1 | +| CAP_SETFCAP | 1 | +| CAP_SETPCAP | 1 | +| CAP_SETGID | 1 | +| CAP_SETUID | 1 | +| CAP_MKNOD | 1 | +| CAP_CHOWN | 1 | +| CAP_FOWNER | 1 | +| CAP_FSETID | 1 | +| CAP_KILL | 1 | +| CAP_SYS_CHROOT | 1 | +| CAP_NET_BROADCAST | 0 | +| CAP_SYS_MODULE | 0 | +| CAP_SYS_RAWIO | 0 | +| CAP_SYS_PACCT | 0 | +| CAP_SYS_ADMIN | 0 | +| CAP_SYS_NICE | 0 | +| CAP_SYS_RESOURCE | 0 | +| CAP_SYS_TIME | 0 | +| CAP_SYS_TTY_CONFIG | 0 | +| CAP_AUDIT_CONTROL | 0 | +| CAP_MAC_OVERRIDE | 0 | +| CAP_MAC_ADMIN | 0 | +| CAP_NET_ADMIN | 0 | +| CAP_SYSLOG | 0 | +| CAP_DAC_READ_SEARCH | 0 | +| CAP_LINUX_IMMUTABLE | 0 | +| CAP_IPC_LOCK | 0 | +| CAP_IPC_OWNER | 0 | +| CAP_SYS_PTRACE | 0 | +| CAP_SYS_BOOT | 0 | +| CAP_LEASE | 0 | +| CAP_WAKE_ALARM | 0 | +| CAP_BLOCK_SUSPEND | 0 | + + +Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) +and [selinux](http://selinuxproject.org/page/Main_Page) can be used with +the containers. A container should support setting an apparmor profile or +selinux process and mount labels if provided in the configuration. + +Standard apparmor profile: +```c +#include +profile flags=(attach_disconnected,mediate_deleted) { + #include + network, + capability, + file, + umount, + + deny @{PROC}/sys/fs/** wklx, + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + deny mount, + + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, +} +``` + +*TODO: seccomp work is being done to find a good default config* + +### Runtime and Init Process + +During container creation the parent process needs to talk to the container's init +process and have a form of synchronization. This is accomplished by creating +a pipe that is passed to the container's init. When the init process first spawns +it will block on its side of the pipe until the parent closes its side. This +allows the parent to have time to set the new process inside a cgroup hierarchy +and/or write any uid/gid mappings required for user namespaces. +The pipe is passed to the init process via FD 3. + +The application consuming libcontainer should be compiled statically. libcontainer +does not define any init process and the arguments provided are used to `exec` the +process inside the application. There should be no long running init within the +container spec. + +If a pseudo tty is provided to a container it will open and `dup2` the console +as the container's STDIN, STDOUT, STDERR as well as mounting the console +as `/dev/console`. + +An extra set of mounts are provided to a container and setup for use. A container's +rootfs can contain some non portable files inside that can cause side effects during +execution of a process. These files are usually created and populated with the container +specific information via the runtime. + +**Extra runtime files:** +* /etc/hosts +* /etc/resolv.conf +* /etc/hostname +* /etc/localtime + + +#### Defaults + +There are a few defaults that can be overridden by users, but in their omission +these apply to processes within a container. + +| Type | Value | +| ------------------- | ------------------------------ | +| Parent Death Signal | SIGKILL | +| UID | 0 | +| GID | 0 | +| GROUPS | 0, NULL | +| CWD | "/" | +| $HOME | Current user's home dir or "/" | +| Readonly rootfs | false | +| Pseudo TTY | false | + + +## Actions + +After a container is created there is a standard set of actions that can +be done to the container. These actions are part of the public API for +a container. + +| Action | Description | +| -------------- | ------------------------------------------------------------------ | +| Get processes | Return all the pids for processes running inside a container | +| Get Stats | Return resource statistics for the container as a whole | +| Wait | Waits on the container's init process ( pid 1 ) | +| Wait Process | Wait on any of the container's processes returning the exit status | +| Destroy | Kill the container's init process and remove any filesystem state | +| Signal | Send a signal to the container's init process | +| Signal Process | Send a signal to any of the container's processes | +| Pause | Pause all processes inside the container | +| Resume | Resume all processes inside the container if paused | +| Exec | Execute a new process inside of the container ( requires setns ) | +| Set | Setup configs of the container after it's created | + +### Execute a new process inside of a running container + +User can execute a new process inside of a running container. Any binaries to be +executed must be accessible within the container's rootfs. + +The started process will run inside the container's rootfs. Any changes +made by the process to the container's filesystem will persist after the +process finished executing. + +The started process will join all the container's existing namespaces. When the +container is paused, the process will also be paused and will resume when +the container is unpaused. The started process will only run when the container's +primary process (PID 1) is running, and will not be restarted when the container +is restarted. + +#### Planned additions + +The started process will have its own cgroups nested inside the container's +cgroups. This is used for process tracking and optionally resource allocation +handling for the new process. Freezer cgroup is required, the rest of the cgroups +are optional. The process executor must place its pid inside the correct +cgroups before starting the process. This is done so that no child processes or +threads can escape the cgroups. + +When the process is stopped, the process executor will try (in a best-effort way) +to stop all its children and remove the sub-cgroups. diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go new file mode 100644 index 0000000..debfc1e --- /dev/null +++ b/libcontainer/apparmor/apparmor.go @@ -0,0 +1,60 @@ +// +build apparmor,linux + +package apparmor + +import ( + "fmt" + "io/ioutil" + "os" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +// IsEnabled returns true if apparmor is enabled for the host. +func IsEnabled() bool { + if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { + if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + return err == nil && len(buf) > 1 && buf[0] == 'Y' + } + } + return false +} + +func setProcAttr(attr, value string) error { + // Under AppArmor you can only change your own attr, so use /proc/self/ + // instead of /proc// like libapparmor does + path := fmt.Sprintf("/proc/self/attr/%s", attr) + + f, err := os.OpenFile(path, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + + if err := utils.EnsureProcHandle(f); err != nil { + return err + } + + _, err = fmt.Fprintf(f, "%s", value) + return err +} + +// changeOnExec reimplements aa_change_onexec from libapparmor in Go +func changeOnExec(name string) error { + value := "exec " + name + if err := setProcAttr("exec", value); err != nil { + return fmt.Errorf("apparmor failed to apply profile: %s", err) + } + return nil +} + +// ApplyProfile will apply the profile with the specified name to the process after +// the next exec. +func ApplyProfile(name string) error { + if name == "" { + return nil + } + + return changeOnExec(name) +} diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go new file mode 100644 index 0000000..d4110cf --- /dev/null +++ b/libcontainer/apparmor/apparmor_disabled.go @@ -0,0 +1,20 @@ +// +build !apparmor !linux + +package apparmor + +import ( + "errors" +) + +var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") + +func IsEnabled() bool { + return false +} + +func ApplyProfile(name string) error { + if name != "" { + return ErrApparmorNotEnabled + } + return nil +} diff --git a/libcontainer/capabilities_linux.go b/libcontainer/capabilities_linux.go new file mode 100644 index 0000000..9daef29 --- /dev/null +++ b/libcontainer/capabilities_linux.go @@ -0,0 +1,117 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/syndtr/gocapability/capability" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS + +var capabilityMap map[string]capability.Cap + +func init() { + capabilityMap = make(map[string]capability.Cap) + last := capability.CAP_LAST_CAP + // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap + if last == capability.Cap(63) { + last = capability.CAP_BLOCK_SUSPEND + } + for _, cap := range capability.List() { + if cap > last { + continue + } + capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())) + capabilityMap[capKey] = cap + } +} + +func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { + bounding := []capability.Cap{} + for _, c := range capConfig.Bounding { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + bounding = append(bounding, v) + } + effective := []capability.Cap{} + for _, c := range capConfig.Effective { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + effective = append(effective, v) + } + inheritable := []capability.Cap{} + for _, c := range capConfig.Inheritable { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + inheritable = append(inheritable, v) + } + permitted := []capability.Cap{} + for _, c := range capConfig.Permitted { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + permitted = append(permitted, v) + } + ambient := []capability.Cap{} + for _, c := range capConfig.Ambient { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + ambient = append(ambient, v) + } + pid, err := capability.NewPid2(0) + if err != nil { + return nil, err + } + err = pid.Load() + if err != nil { + return nil, err + } + return &containerCapabilities{ + bounding: bounding, + effective: effective, + inheritable: inheritable, + permitted: permitted, + ambient: ambient, + pid: pid, + }, nil +} + +type containerCapabilities struct { + pid capability.Capabilities + bounding []capability.Cap + effective []capability.Cap + inheritable []capability.Cap + permitted []capability.Cap + ambient []capability.Cap +} + +// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. +func (c *containerCapabilities) ApplyBoundingSet() error { + c.pid.Clear(capability.BOUNDS) + c.pid.Set(capability.BOUNDS, c.bounding...) + return c.pid.Apply(capability.BOUNDS) +} + +// Apply sets all the capabilities for the current process in the config. +func (c *containerCapabilities) ApplyCaps() error { + c.pid.Clear(allCapabilityTypes) + c.pid.Set(capability.BOUNDS, c.bounding...) + c.pid.Set(capability.PERMITTED, c.permitted...) + c.pid.Set(capability.INHERITABLE, c.inheritable...) + c.pid.Set(capability.EFFECTIVE, c.effective...) + c.pid.Set(capability.AMBIENT, c.ambient...) + return c.pid.Apply(allCapabilityTypes) +} diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go new file mode 100644 index 0000000..c0a9659 --- /dev/null +++ b/libcontainer/cgroups/cgroups.go @@ -0,0 +1,74 @@ +// +build linux + +package cgroups + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager interface { + // Applies cgroup configuration to the process with the specified pid + Apply(pid int) error + + // Returns the PIDs inside the cgroup set + GetPids() ([]int, error) + + // Returns the PIDs inside the cgroup set & all sub-cgroups + GetAllPids() ([]int, error) + + // Returns statistics for the cgroup set + GetStats() (*Stats, error) + + // Toggles the freezer cgroup according with specified state + Freeze(state configs.FreezerState) error + + // Destroys the cgroup set + Destroy() error + + // The option func SystemdCgroups() and Cgroupfs() require following attributes: + // Paths map[string]string + // Cgroups *configs.Cgroup + // Paths maps cgroup subsystem to path at which it is mounted. + // Cgroups specifies specific cgroup settings for the various subsystems + + // Returns cgroup paths to save in a state file and to be able to + // restore the object later. + GetPaths() map[string]string + + // GetUnifiedPath returns the unified path when running in unified mode. + // The value corresponds to the all values of GetPaths() map. + // + // GetUnifiedPath returns error when running in hybrid mode as well as + // in legacy mode. + GetUnifiedPath() (string, error) + + // Sets the cgroup as configured. + Set(container *configs.Config) error + + // Gets the cgroup as configured. + GetCgroups() (*configs.Cgroup, error) +} + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go new file mode 100644 index 0000000..9efb83e --- /dev/null +++ b/libcontainer/cgroups/cgroups_test.go @@ -0,0 +1,20 @@ +// +build linux + +package cgroups + +import ( + "testing" +) + +func TestParseCgroups(t *testing.T) { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + t.Fatal(err) + } + if IsCgroup2UnifiedMode() { + return + } + if _, ok := cgroups["cpu"]; !ok { + t.Fail() + } +} diff --git a/libcontainer/cgroups/cgroups_unsupported.go b/libcontainer/cgroups/cgroups_unsupported.go new file mode 100644 index 0000000..278d507 --- /dev/null +++ b/libcontainer/cgroups/cgroups_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package cgroups diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go new file mode 100644 index 0000000..847ce8e --- /dev/null +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -0,0 +1,180 @@ +// Package devicefilter containes eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devicefilter + +import ( + "fmt" + "math" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. +func (p *program) appendDevice(dev *configs.Device) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + if p.hasWildCard { + // All entries after wildcard entry are ignored + return nil + } + + bpfType := int32(-1) + hasType := true + switch dev.Type { + case 'c': + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case 'b': + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + case 'a': + hasType = false + default: + // if not specified in OCI json, typ is set to DeviceTypeAll + return errors.Errorf("invalid DeviceType %q", string(dev.Type)) + } + if dev.Major > math.MaxUint32 { + return errors.Errorf("invalid major %d", dev.Major) + } + if dev.Minor > math.MaxUint32 { + return errors.Errorf("invalid minor %d", dev.Major) + } + hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := dev.Minor >= 0 + bpfAccess := int32(0) + for _, r := range dev.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return errors.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + blockSym := fmt.Sprintf("block-%d", p.blockID) + nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) + prevBlockLastIdx := len(p.insts) - 1 + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) + // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted + return p.insts, nil + } + blockSym := fmt.Sprintf("block-%d", p.blockID) + p.insts = append(p.insts, + // R0 <- 0 + asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go new file mode 100644 index 0000000..59ff4b4 --- /dev/null +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go @@ -0,0 +1,258 @@ +package devicefilter + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" +) + +func hash(s, comm string) string { + var res []string + for _, l := range strings.Split(s, "\n") { + trimmed := strings.TrimSpace(l) + if trimmed == "" || strings.HasPrefix(trimmed, comm) { + continue + } + res = append(res, trimmed) + } + return strings.Join(res, "\n") +} + +func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) { + insts, _, err := DeviceFilter(devices) + if err != nil { + t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) + } + s := insts.String() + t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s) + if expectedStr != "" { + hashed := hash(s, "//") + expectedHashed := hash(expectedStr, "//") + if expectedHashed != hashed { + t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) + } + } +} + +func TestDeviceFilter_Nil(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) + 5: Mov32Imm dst: r0 imm: 0 + 6: Exit + ` + testDeviceFilter(t, nil, expected) +} + +func TestDeviceFilter_BuiltInAllowList(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// tuntap (c, 10, 200, rwm, allow) + 5: JNEImm dst: r2 off: -1 imm: 2 + 6: JNEImm dst: r4 off: -1 imm: 10 + 7: JNEImm dst: r5 off: -1 imm: 200 + 8: Mov32Imm dst: r0 imm: 1 + 9: Exit +block-1: + 10: JNEImm dst: r2 off: -1 imm: 2 + 11: JNEImm dst: r4 off: -1 imm: 5 + 12: JNEImm dst: r5 off: -1 imm: 2 + 13: Mov32Imm dst: r0 imm: 1 + 14: Exit +block-2: +// /dev/pts (c, 136, wildcard, rwm, true) + 15: JNEImm dst: r2 off: -1 imm: 2 + 16: JNEImm dst: r4 off: -1 imm: 136 + 17: Mov32Imm dst: r0 imm: 1 + 18: Exit +block-3: + 19: JNEImm dst: r2 off: -1 imm: 2 + 20: JNEImm dst: r4 off: -1 imm: 5 + 21: JNEImm dst: r5 off: -1 imm: 1 + 22: Mov32Imm dst: r0 imm: 1 + 23: Exit +block-4: + 24: JNEImm dst: r2 off: -1 imm: 2 + 25: JNEImm dst: r4 off: -1 imm: 1 + 26: JNEImm dst: r5 off: -1 imm: 9 + 27: Mov32Imm dst: r0 imm: 1 + 28: Exit +block-5: + 29: JNEImm dst: r2 off: -1 imm: 2 + 30: JNEImm dst: r4 off: -1 imm: 1 + 31: JNEImm dst: r5 off: -1 imm: 5 + 32: Mov32Imm dst: r0 imm: 1 + 33: Exit +block-6: + 34: JNEImm dst: r2 off: -1 imm: 2 + 35: JNEImm dst: r4 off: -1 imm: 5 + 36: JNEImm dst: r5 off: -1 imm: 0 + 37: Mov32Imm dst: r0 imm: 1 + 38: Exit +block-7: + 39: JNEImm dst: r2 off: -1 imm: 2 + 40: JNEImm dst: r4 off: -1 imm: 1 + 41: JNEImm dst: r5 off: -1 imm: 7 + 42: Mov32Imm dst: r0 imm: 1 + 43: Exit +block-8: + 44: JNEImm dst: r2 off: -1 imm: 2 + 45: JNEImm dst: r4 off: -1 imm: 1 + 46: JNEImm dst: r5 off: -1 imm: 8 + 47: Mov32Imm dst: r0 imm: 1 + 48: Exit +block-9: + 49: JNEImm dst: r2 off: -1 imm: 2 + 50: JNEImm dst: r4 off: -1 imm: 1 + 51: JNEImm dst: r5 off: -1 imm: 3 + 52: Mov32Imm dst: r0 imm: 1 + 53: Exit +block-10: +// (b, wildcard, wildcard, m, true) + 54: JNEImm dst: r2 off: -1 imm: 1 + 55: Mov32Reg dst: r1 src: r3 + 56: And32Imm dst: r1 imm: 1 + 57: JEqImm dst: r1 off: -1 imm: 0 + 58: Mov32Imm dst: r0 imm: 1 + 59: Exit +block-11: +// (c, wildcard, wildcard, m, true) + 60: JNEImm dst: r2 off: -1 imm: 2 + 61: Mov32Reg dst: r1 src: r3 + 62: And32Imm dst: r1 imm: 1 + 63: JEqImm dst: r1 off: -1 imm: 0 + 64: Mov32Imm dst: r0 imm: 1 + 65: Exit +block-12: + 66: Mov32Imm dst: r0 imm: 0 + 67: Exit +` + testDeviceFilter(t, specconv.AllowedDevices, expected) +} + +func TestDeviceFilter_Privileged(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + } + expected := + ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 1 (accept) + 5: Mov32Imm dst: r0 imm: 1 + 6: Exit + ` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 0, + Permissions: "rwm", + Allow: false, + }, + } + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 0 + 5: JNEImm dst: r2 off: -1 imm: 1 + 6: JNEImm dst: r4 off: -1 imm: 8 + 7: JNEImm dst: r5 off: -1 imm: 0 + 8: Mov32Imm dst: r0 imm: 0 + 9: Exit +block-1: +// return 1 (accept) + 10: Mov32Imm dst: r0 imm: 1 + 11: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Weird(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'b', + Major: 8, + Minor: 1, + Permissions: "rwm", + Allow: false, + }, + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 2, + Permissions: "rwm", + Allow: false, + }, + } + // 8/1 is allowed, 8/2 is not allowed. + // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 2 + 5: JNEImm dst: r2 off: -1 imm: 1 + 6: JNEImm dst: r4 off: -1 imm: 8 + 7: JNEImm dst: r5 off: -1 imm: 2 + 8: Mov32Imm dst: r0 imm: 0 + 9: Exit +block-1: +// return 1 (accept) + 10: Mov32Imm dst: r0 imm: 1 + 11: Exit +` + testDeviceFilter(t, devices, expected) +} diff --git a/libcontainer/cgroups/ebpf/ebpf.go b/libcontainer/cgroups/ebpf/ebpf.go new file mode 100644 index 0000000..4795e0a --- /dev/null +++ b/libcontainer/cgroups/ebpf/ebpf.go @@ -0,0 +1,45 @@ +package ebpf + +import ( + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. + memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + closer := func() error { + if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + return nil + } + return closer, nil +} diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go new file mode 100644 index 0000000..ec148b4 --- /dev/null +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -0,0 +1,411 @@ +// +build linux + +package fs + +import ( + "fmt" + "io" + "os" + "path/filepath" + "sync" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +var ( + subsystemsLegacy = subsystemSet{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + } + HugePageSizes, _ = cgroups.GetHugePageSize() +) + +var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") + +type subsystemSet []subsystem + +func (s subsystemSet) Get(name string) (subsystem, error) { + for _, ss := range s { + if ss.Name() == name { + return ss, nil + } + } + return nil, errSubsystemDoesNotExist +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Removes the cgroup represented by 'cgroupData'. + Remove(*cgroupData) error + // Creates and joins the cgroup represented by 'cgroupData'. + Apply(*cgroupData) error + // Set the cgroup represented by cgroup. + Set(path string, cgroup *configs.Cgroup) error +} + +type Manager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Rootless bool // ignore permission-related errors + Paths map[string]string +} + +// The absolute path to the root of the cgroup hierarchies. +var cgroupRootLock sync.Mutex +var cgroupRoot string + +// Gets the cgroupRoot. +func getCgroupRoot() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + root, err := cgroups.FindCgroupMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +type cgroupData struct { + root string + innerPath string + config *configs.Cgroup + pid int +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if os.IsPermission(errors.Cause(err)) { + return true + } + + // Try to handle other errnos. + var errno error + switch err := errors.Cause(err).(type) { + case *os.PathError: + errno = err.Err + case *os.LinkError: + errno = err.Err + case *os.SyscallError: + errno = err.Err + } + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES +} + +func (m *Manager) getSubsystems() subsystemSet { + return subsystemsLegacy +} + +func (m *Manager) Apply(pid int) (err error) { + if m.Cgroups == nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + + var c = m.Cgroups + + d, err := getCgroupData(m.Cgroups, pid) + if err != nil { + return err + } + + m.Paths = make(map[string]string) + if c.Paths != nil { + for name, path := range c.Paths { + _, err := d.path(name) + if err != nil { + if cgroups.IsNotFound(err) { + continue + } + return err + } + m.Paths[name] = path + } + return cgroups.EnterPid(m.Paths, pid) + } + + for _, sys := range m.getSubsystems() { + // TODO: Apply should, ideally, be reentrant or be broken up into a separate + // create and join phase so that the cgroup hierarchy for a container can be + // created then join consists of writing the process pids to cgroup.procs + p, err := d.path(sys.Name()) + if err != nil { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && sys.Name() != "devices" { + continue + } + return err + } + m.Paths[sys.Name()] = p + + if err := sys.Apply(d); err != nil { + // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't + // been set, we don't bail on error in case of permission problems. + // Cases where limits have been set (and we couldn't create our own + // cgroup) are handled by Set. + if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { + delete(m.Paths, sys.Name()) + continue + } + return err + } + + } + return nil +} + +func (m *Manager) Destroy() error { + if m.Cgroups == nil || m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *Manager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} + +func (m *Manager) GetUnifiedPath() (string, error) { + return "", errors.New("unified path is only supported when running in unified mode") +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := m.getSubsystems().Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *Manager) Set(container *configs.Config) error { + if container.Cgroups == nil { + return nil + } + + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups != nil && m.Cgroups.Paths != nil { + return nil + } + + paths := m.GetPaths() + for _, sys := range m.getSubsystems() { + path := paths[sys.Name()] + if err := sys.Set(path, container.Cgroups); err != nil { + if m.Rootless && sys.Name() == "devices" { + continue + } + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + if m.Paths["cpu"] != "" { + if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *Manager) Freeze(state configs.FreezerState) error { + if m.Cgroups == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + paths := m.GetPaths() + dir := paths["freezer"] + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := m.getSubsystems().Get("freezer") + if err != nil { + return err + } + err = freezer.Set(dir, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + paths := m.GetPaths() + return cgroups.GetPids(paths["devices"]) +} + +func (m *Manager) GetAllPids() ([]int, error) { + paths := m.GetPaths() + return cgroups.GetAllPids(paths["devices"]) +} + +func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { + root, err := getCgroupRoot() + if err != nil { + return nil, err + } + + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove this code. Path safety is important! -- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } + + return &cgroupData{ + root: root, + innerPath: innerPath, + config: c, + pid: pid, + }, nil +} + +func (raw *cgroupData) path(subsystem string) (string, error) { + mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(raw.innerPath) { + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, raw.innerPath), nil +} + +func (raw *cgroupData) join(subsystem string) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +func removePath(p string, err error) error { + if err != nil { + return err + } + if p != "" { + return os.RemoveAll(p) + } + return nil +} + +func CheckCpushares(path string, c uint64) error { + var cpuShares uint64 + + if c == 0 { + return nil + } + + fd, err := os.Open(filepath.Join(path, "cpu.shares")) + if err != nil { + return err + } + defer fd.Close() + + _, err = fmt.Fscanf(fd, "%d", &cpuShares) + if err != nil && err != io.EOF { + return err + } + + if c > cpuShares { + return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares) + } else if c < cpuShares { + return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares) + } + + return nil +} + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/libcontainer/cgroups/fs/apply_raw_test.go b/libcontainer/cgroups/fs/apply_raw_test.go new file mode 100644 index 0000000..f3b6556 --- /dev/null +++ b/libcontainer/cgroups/fs/apply_raw_test.go @@ -0,0 +1,297 @@ +// +build linux + +package fs + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestInvalidCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Path: "../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +func TestInvalidAbsoluteCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Path: "/../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidCgroupParent(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "../../../../../../../../../../some/path", + Name: "name", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidAbsoluteCgroupParent(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "/../../../../../../../../../../some/path", + Name: "name", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidCgroupName(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "parent", + Name: "../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } + +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidAbsoluteCgroupName(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "parent", + Name: "/../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidCgroupNameAndParent(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "../../../../../../../../../../some/path", + Name: "../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} + +// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. +func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + root, err := getCgroupRoot() + if err != nil { + t.Errorf("couldn't get cgroup root: %v", err) + } + + config := &configs.Cgroup{ + Parent: "/../../../../../../../../../../some/path", + Name: "/../../../../../../../../../../some/path", + } + + data, err := getCgroupData(config, 0) + if err != nil { + t.Errorf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Errorf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } +} diff --git a/libcontainer/cgroups/fs/blkio.go b/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 0000000..52c118d --- /dev/null +++ b/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,238 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(d *cgroupData) error { + _, err := d.join("blkio") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.BlkioWeight != 0 { + if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + if cgroup.Resources.BlkioLeafWeight != 0 { + if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range cgroup.Resources.BlkioWeightDevice { + if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil { + return err + } + if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +func (s *BlkioGroup) Remove(d *cgroupData) error { + return removePath(d.path("blkio")) +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, err + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, err + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, err + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + // Try to read CFQ stats available on all CFQ enabled kernels first + if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil { + return getCFQStats(path, stats) + } + return getStats(path, stats) // Use generic stats as fallback +} + +func getCFQStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil { + return err + } + stats.BlkioStats.SectorsRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil { + return err + } + stats.BlkioStats.IoQueuedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServiceTimeRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoWaitTimeRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil { + return err + } + stats.BlkioStats.IoMergedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil { + return err + } + stats.BlkioStats.IoTimeRecursive = blkioStats + + return nil +} + +func getStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + return nil +} diff --git a/libcontainer/cgroups/fs/blkio_test.go b/libcontainer/cgroups/fs/blkio_test.go new file mode 100644 index 0000000..5ba60fa --- /dev/null +++ b/libcontainer/cgroups/fs/blkio_test.go @@ -0,0 +1,637 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + sectorsRecursiveContents = `8:0 1024` + serviceBytesRecursiveContents = `8:0 Read 100 +8:0 Write 200 +8:0 Sync 300 +8:0 Async 500 +8:0 Total 500 +Total 500` + servicedRecursiveContents = `8:0 Read 10 +8:0 Write 40 +8:0 Sync 20 +8:0 Async 30 +8:0 Total 50 +Total 50` + queuedRecursiveContents = `8:0 Read 1 +8:0 Write 4 +8:0 Sync 2 +8:0 Async 3 +8:0 Total 5 +Total 5` + serviceTimeRecursiveContents = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173959 +8:0 Total 17395 +Total 17395` + waitTimeRecursiveContents = `8:0 Read 15571 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 15571 +8:0 Total 15571` + mergedRecursiveContents = `8:0 Read 5 +8:0 Write 10 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 15 +Total 15` + timeRecursiveContents = `8:0 8` + throttleServiceBytes = `8:0 Read 11030528 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 11030528 +8:0 Total 11030528 +252:0 Read 11030528 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 11030528 +252:0 Total 11030528 +Total 22061056` + throttleServiced = `8:0 Read 164 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 164 +8:0 Total 164 +252:0 Read 164 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 164 +252:0 Total 164 +Total 328` +) + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { + *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) +} + +func TestBlkioSetWeight(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightBefore = 100 + weightAfter = 200 + ) + + helper.writeFileContents(map[string]string{ + "blkio.weight": strconv.Itoa(weightBefore), + }) + + helper.CgroupData.config.Resources.BlkioWeight = weightAfter + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "blkio.weight") + if err != nil { + t.Fatalf("Failed to parse blkio.weight - %s", err) + } + + if value != weightAfter { + t.Fatal("Got the wrong value, set blkio.weight failed.") + } +} + +func TestBlkioSetWeightDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightDeviceBefore = "8:0 400" + ) + + wd := configs.NewWeightDevice(8, 0, 500, 0) + weightDeviceAfter := wd.WeightString() + + helper.writeFileContents(map[string]string{ + "blkio.weight_device": weightDeviceBefore, + }) + + helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") + if err != nil { + t.Fatalf("Failed to parse blkio.weight_device - %s", err) + } + + if value != weightDeviceAfter { + t.Fatal("Got the wrong value, set blkio.weight_device failed.") + } +} + +// regression #274 +func TestBlkioSetMultipleWeightDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightDeviceBefore = "8:0 400" + ) + + wd1 := configs.NewWeightDevice(8, 0, 500, 0) + wd2 := configs.NewWeightDevice(8, 16, 500, 0) + // we cannot actually set and check both because normal ioutil.WriteFile + // when writing to cgroup file will overwrite the whole file content instead + // of updating it as the kernel is doing. Just check the second device + // is present will suffice for the test to ensure multiple writes are done. + weightDeviceAfter := wd2.WeightString() + + helper.writeFileContents(map[string]string{ + "blkio.weight_device": weightDeviceBefore, + }) + + helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") + if err != nil { + t.Fatalf("Failed to parse blkio.weight_device - %s", err) + } + + if value != weightDeviceAfter { + t.Fatal("Got the wrong value, set blkio.weight_device failed.") + } +} + +func TestBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoSectorsFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoServiceBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoServicedFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoQueuedFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoServiceTimeFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoWaitTimeFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoMergedFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoTimeFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read 100 100", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestBlkioStatsUnexpectedFieldType(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read Write", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestNonCFQBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes": throttleServiceBytes, + "blkio.throttle.io_serviced": throttleServiced, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.read_bps_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") + } +} +func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.write_bps_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") + } +} +func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.read_iops_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") + } +} +func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.write_iops_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") + } +} diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 0000000..4db7b64 --- /dev/null +++ b/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,118 @@ +// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpuGroup struct { +} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(d *cgroupData) error { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + path, err := d.path("cpu") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(path, d.config, d.pid) +} + +func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpu cgroup mounted. + // Just do nothing and don't fail. + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, cgroup); err != nil { + return err + } + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuRtPeriod != 0 { + if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuRtRuntime != 0 { + if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuShares != 0 { + if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuPeriod != 0 { + if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuQuota != 0 { + if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil { + return err + } + } + return s.SetRtSched(path, cgroup) +} + +func (s *CpuGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpu")) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(path, "cpu.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/libcontainer/cgroups/fs/cpu_test.go b/libcontainer/cgroups/fs/cpu_test.go new file mode 100644 index 0000000..2eeb489 --- /dev/null +++ b/libcontainer/cgroups/fs/cpu_test.go @@ -0,0 +1,210 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +func TestCpuSetShares(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + sharesBefore = 1024 + sharesAfter = 512 + ) + + helper.writeFileContents(map[string]string{ + "cpu.shares": strconv.Itoa(sharesBefore), + }) + + helper.CgroupData.config.Resources.CpuShares = sharesAfter + cpu := &CpuGroup{} + if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares") + if err != nil { + t.Fatalf("Failed to parse cpu.shares - %s", err) + } + + if value != sharesAfter { + t.Fatal("Got the wrong value, set cpu.shares failed.") + } +} + +func TestCpuSetBandWidth(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + quotaBefore = 8000 + quotaAfter = 5000 + periodBefore = 10000 + periodAfter = 7000 + rtRuntimeBefore = 8000 + rtRuntimeAfter = 5000 + rtPeriodBefore = 10000 + rtPeriodAfter = 7000 + ) + + helper.writeFileContents(map[string]string{ + "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), + "cpu.cfs_period_us": strconv.Itoa(periodBefore), + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + helper.CgroupData.config.Resources.CpuQuota = quotaAfter + helper.CgroupData.config.Resources.CpuPeriod = periodAfter + helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter + helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + cpu := &CpuGroup{} + if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us") + if err != nil { + t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err) + } + if quota != quotaAfter { + t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") + } + + period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us") + if err != nil { + t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err) + } + if period != periodAfter { + t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") + } + rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } +} + +func TestCpuStats(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + nrPeriods = 2000 + nrThrottled = 200 + throttledTime = uint64(18446744073709551615) + ) + + cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n", + nrPeriods, nrThrottled, throttledTime) + helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.ThrottlingData{ + Periods: nrPeriods, + ThrottledPeriods: nrThrottled, + ThrottledTime: throttledTime} + + expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) +} + +func TestNoCpuStatFile(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal("Expected not to fail, but did") + } +} + +func TestInvalidCpuStat(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + cpuStatContent := `nr_periods 2000 + nr_throttled 200 + throttled_time fortytwo` + helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failed stat parsing.") + } +} + +func TestCpuSetRtSchedAtApply(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + rtRuntimeBefore = 0 + rtRuntimeAfter = 5000 + rtPeriodBefore = 0 + rtPeriodAfter = 7000 + ) + + helper.writeFileContents(map[string]string{ + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter + helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + cpu := &CpuGroup{} + if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil { + t.Fatal(err) + } + + rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } + pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs") + if err != nil { + t.Fatalf("Failed to parse cgroup.procs - %s", err) + } + if pid != 1234 { + t.Fatal("Got the wrong value, set cgroup.procs failed.") + } +} diff --git a/libcontainer/cgroups/fs/cpuacct.go b/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 0000000..95dc9a1 --- /dev/null +++ b/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,122 @@ +// +build linux + +package fs + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + nanosecondsInSecond = 1000000000 +) + +var clockTicks = uint64(system.GetClockTicks()) + +type CpuacctGroup struct { +} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(d *cgroupData) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) { + return err + } + + return nil +} + +func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *CpuacctGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpuacct")) +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + userModeUsage := uint64(0) + kernelModeUsage := uint64(0) + const ( + userField = "user" + systemField = "system" + ) + + // Expected format: + // user + // system + data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat)) + if err != nil { + return 0, 0, err + } + fields := strings.Fields(string(data)) + if len(fields) < 4 { + return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat)) + } + if fields[0] != userField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) + } + if fields[2] != systemField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, err + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, err + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + percpuUsage := []uint64{} + data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu")) + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(string(data)) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err) + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} diff --git a/libcontainer/cgroups/fs/cpuset.go b/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 0000000..bfc900e --- /dev/null +++ b/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,160 @@ +// +build linux + +package fs + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +type CpusetGroup struct { +} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(d *cgroupData) error { + dir, err := d.path("cpuset") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(dir, d.config, d.pid) +} + +func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) Remove(d *cgroupData) error { + return removePath(d.path("cpuset")) +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo") + if err != nil { + return err + } + root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo))) + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := s.ensureParent(filepath.Dir(dir), root); err != nil { + return err + } + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + return err + } + + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + return cgroups.WriteCgroupProc(dir, pid) +} + +func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. +func (s *CpusetGroup) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if libcontainerUtils.CleanPath(parent) == root { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if err := s.ensureParent(parent, root); err != nil { + return err + } + if err := os.MkdirAll(current, 0755); err != nil { + return err + } + return s.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (s *CpusetGroup) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + + if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { + return err + } + if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + return err + } + + if s.isEmpty(currentCpus) { + if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if s.isEmpty(currentMems) { + if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { + if err := s.Set(path, cgroup); err != nil { + return err + } + return s.copyIfNeeded(path, filepath.Dir(path)) +} diff --git a/libcontainer/cgroups/fs/cpuset_test.go b/libcontainer/cgroups/fs/cpuset_test.go new file mode 100644 index 0000000..927e631 --- /dev/null +++ b/libcontainer/cgroups/fs/cpuset_test.go @@ -0,0 +1,67 @@ +// +build linux + +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +func TestCpusetSetCpus(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + const ( + cpusBefore = "0" + cpusAfter = "1-3" + ) + + helper.writeFileContents(map[string]string{ + "cpuset.cpus": cpusBefore, + }) + + helper.CgroupData.config.Resources.CpusetCpus = cpusAfter + cpuset := &CpusetGroup{} + if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus") + if err != nil { + t.Fatalf("Failed to parse cpuset.cpus - %s", err) + } + + if value != cpusAfter { + t.Fatal("Got the wrong value, set cpuset.cpus failed.") + } +} + +func TestCpusetSetMems(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + const ( + memsBefore = "0" + memsAfter = "1" + ) + + helper.writeFileContents(map[string]string{ + "cpuset.mems": memsBefore, + }) + + helper.CgroupData.config.Resources.CpusetMems = memsAfter + cpuset := &CpusetGroup{} + if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems") + if err != nil { + t.Fatalf("Failed to parse cpuset.mems - %s", err) + } + + if value != memsAfter { + t.Fatal("Got the wrong value, set cpuset.mems failed.") + } +} diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go new file mode 100644 index 0000000..036c8db --- /dev/null +++ b/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,81 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +) + +type DevicesGroup struct { +} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(d *cgroupData) error { + _, err := d.join("devices") + if err != nil { + // We will return error even it's `not found` error, devices + // cgroup is hard requirement for container's security. + return err + } + return nil +} + +func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { + if system.RunningInUserNS() { + return nil + } + + devices := cgroup.Resources.Devices + if len(devices) > 0 { + for _, dev := range devices { + file := "devices.deny" + if dev.Allow { + file = "devices.allow" + } + if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil { + return err + } + } + return nil + } + if cgroup.Resources.AllowAllDevices != nil { + if *cgroup.Resources.AllowAllDevices == false { + if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil { + return err + } + + for _, dev := range cgroup.Resources.AllowedDevices { + if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil { + return err + } + } + return nil + } + + if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil { + return err + } + } + + for _, dev := range cgroup.Resources.DeniedDevices { + if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *DevicesGroup) Remove(d *cgroupData) error { + return removePath(d.path("devices")) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/devices_test.go b/libcontainer/cgroups/fs/devices_test.go new file mode 100644 index 0000000..648f4a2 --- /dev/null +++ b/libcontainer/cgroups/fs/devices_test.go @@ -0,0 +1,99 @@ +// +build linux + +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + allowedDevices = []*configs.Device{ + { + Path: "/dev/zero", + Type: 'c', + Major: 1, + Minor: 5, + Permissions: "rwm", + FileMode: 0666, + }, + } + allowedList = "c 1:5 rwm" + deniedDevices = []*configs.Device{ + { + Path: "/dev/null", + Type: 'c', + Major: 1, + Minor: 3, + Permissions: "rwm", + FileMode: 0666, + }, + } + deniedList = "c 1:3 rwm" +) + +func TestDevicesSetAllow(t *testing.T) { + helper := NewCgroupTestUtil("devices", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "devices.deny": "a", + }) + allowAllDevices := false + helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices + helper.CgroupData.config.Resources.AllowedDevices = allowedDevices + devices := &DevicesGroup{} + if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") + if err != nil { + t.Fatalf("Failed to parse devices.allow - %s", err) + } + + if value != allowedList { + t.Fatal("Got the wrong value, set devices.allow failed.") + } + + // When AllowAllDevices is nil, devices.allow file should not be modified. + helper.CgroupData.config.Resources.AllowAllDevices = nil + if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") + if err != nil { + t.Fatalf("Failed to parse devices.allow - %s", err) + } + if value != allowedList { + t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.") + } +} + +func TestDevicesSetDeny(t *testing.T) { + helper := NewCgroupTestUtil("devices", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "devices.allow": "a", + }) + + allowAllDevices := true + helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices + helper.CgroupData.config.Resources.DeniedDevices = deniedDevices + devices := &DevicesGroup{} + if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny") + if err != nil { + t.Fatalf("Failed to parse devices.deny - %s", err) + } + + if value != deniedList { + t.Fatal("Got the wrong value, set devices.deny failed.") + } +} diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 0000000..9dc81bd --- /dev/null +++ b/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,67 @@ +// +build linux + +package fs + +import ( + "fmt" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type FreezerGroup struct { +} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(d *cgroupData) error { + _, err := d.join("freezer") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { + switch cgroup.Resources.Freezer { + case configs.Frozen, configs.Thawed: + for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. + if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + return err + } + + state, err := fscommon.ReadFile(path, "freezer.state") + if err != nil { + return err + } + if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { + break + } + + time.Sleep(1 * time.Millisecond) + } + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + } + + return nil +} + +func (s *FreezerGroup) Remove(d *cgroupData) error { + return removePath(d.path("freezer")) +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/freezer_test.go b/libcontainer/cgroups/fs/freezer_test.go new file mode 100644 index 0000000..ad80261 --- /dev/null +++ b/libcontainer/cgroups/fs/freezer_test.go @@ -0,0 +1,48 @@ +// +build linux + +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestFreezerSetState(t *testing.T) { + helper := NewCgroupTestUtil("freezer", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "freezer.state": string(configs.Frozen), + }) + + helper.CgroupData.config.Resources.Freezer = configs.Thawed + freezer := &FreezerGroup{} + if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state") + if err != nil { + t.Fatalf("Failed to parse freezer.state - %s", err) + } + if value != string(configs.Thawed) { + t.Fatal("Got the wrong value, set freezer.state failed.") + } +} + +func TestFreezerSetInvalidState(t *testing.T) { + helper := NewCgroupTestUtil("freezer", t) + defer helper.cleanup() + + const ( + invalidArg configs.FreezerState = "Invalid" + ) + + helper.CgroupData.config.Resources.Freezer = invalidArg + freezer := &FreezerGroup{} + if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil { + t.Fatal("Failed to return invalid argument error") + } +} diff --git a/libcontainer/cgroups/fs/fs_unsupported.go b/libcontainer/cgroups/fs/fs_unsupported.go new file mode 100644 index 0000000..3ef9e03 --- /dev/null +++ b/libcontainer/cgroups/fs/fs_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package fs diff --git a/libcontainer/cgroups/fs/hugetlb.go b/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 0000000..68719c2 --- /dev/null +++ b/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,72 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct { +} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(d *cgroupData) error { + _, err := d.join("hugetlb") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, hugetlb := range cgroup.Resources.HugetlbLimit { + if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) Remove(d *cgroupData) error { + return removePath(d.path("hugetlb")) +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range HugePageSizes { + usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".") + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", usage, err) + } + hugetlbStats.Usage = value + + maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".") + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + hugetlbStats.MaxUsage = value + + failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".") + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go new file mode 100644 index 0000000..9ddacfe --- /dev/null +++ b/libcontainer/cgroups/fs/hugetlb_test.go @@ -0,0 +1,155 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + hugetlbUsageContents = "128\n" + hugetlbMaxUsageContents = "256\n" + hugetlbFailcnt = "100\n" +) + +var ( + usage = "hugetlb.%s.usage_in_bytes" + limit = "hugetlb.%s.limit_in_bytes" + maxUsage = "hugetlb.%s.max_usage_in_bytes" + failcnt = "hugetlb.%s.failcnt" +) + +func TestHugetlbSetHugetlb(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + + const ( + hugetlbBefore = 256 + hugetlbAfter = 512 + ) + + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), + }) + } + + for _, pageSize := range HugePageSizes { + helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{ + { + Pagesize: pageSize, + Limit: hugetlbAfter, + }, + } + hugetlb := &HugetlbGroup{} + if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + } + + for _, pageSize := range HugePageSizes { + limit := fmt.Sprintf(limit, pageSize) + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit) + if err != nil { + t.Fatalf("Failed to parse %s - %s", limit, err) + } + if value != hugetlbAfter { + t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value) + } + } +} + +func TestHugetlbStats(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, + fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range HugePageSizes { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbStatsNoUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + maxUsage: hugetlbMaxUsageContents, + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): "bad", + maxUsage: hugetlbMaxUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + usage: hugetlbUsageContents, + maxUsage: "bad", + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} diff --git a/libcontainer/cgroups/fs/kmem.go b/libcontainer/cgroups/fs/kmem.go new file mode 100644 index 0000000..69b5a19 --- /dev/null +++ b/libcontainer/cgroups/fs/kmem.go @@ -0,0 +1,62 @@ +// +build linux,!nokmem + +package fs + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" // for Errno type only + + "github.com/opencontainers/runc/libcontainer/cgroups" + "golang.org/x/sys/unix" +) + +const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + +func EnableKernelMemoryAccounting(path string) error { + // Ensure that kernel memory is available in this kernel build. If it + // isn't, we just ignore it because EnableKernelMemoryAccounting is + // automatically called for all memory limits. + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + return nil + } + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // We have specifically been asked to set a kmem limit. If the kernel + // doesn't support it we *must* error out. + return errors.New("kernel memory accounting not supported by this kernel") + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == unix.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} diff --git a/libcontainer/cgroups/fs/kmem_disabled.go b/libcontainer/cgroups/fs/kmem_disabled.go new file mode 100644 index 0000000..ac290fd --- /dev/null +++ b/libcontainer/cgroups/fs/kmem_disabled.go @@ -0,0 +1,15 @@ +// +build linux,nokmem + +package fs + +import ( + "errors" +) + +func EnableKernelMemoryAccounting(path string) error { + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + return errors.New("kernel memory accounting disabled in this runc build") +} diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go new file mode 100644 index 0000000..f81ed05 --- /dev/null +++ b/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,271 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" +) + +type MemoryGroup struct { +} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(d *cgroupData) (err error) { + path, err := d.path("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } else if path == "" { + return nil + } + if memoryAssigned(d.config) { + if _, err := os.Stat(path); os.IsNotExist(err) { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // Only enable kernel memory accouting when this cgroup + // is created by libcontainer, otherwise we might get + // error when people use `cgroupsPath` to join an existed + // cgroup whose kernel memory is not initialized. + if err := EnableKernelMemoryAccounting(path); err != nil { + return err + } + } + } + defer func() { + if err != nil { + os.RemoveAll(path) + } + }() + + // We need to join memory cgroup after set memory limits, because + // kmem.limit_in_bytes can only be set when the cgroup is empty. + _, err = d.join("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { + // If the memory update is set to -1 we should also + // set swap to -1, it means unlimited memory. + if cgroup.Resources.Memory == -1 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + cgroup.Resources.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } else { + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } else { + if cgroup.Resources.Memory != 0 { + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + if cgroup.Resources.MemorySwap != 0 { + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } + + return nil +} + +func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { + if err := setMemoryAndSwap(path, cgroup); err != nil { + return err + } + + if cgroup.Resources.KernelMemory != 0 { + if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + return err + } + } + + if cgroup.Resources.MemoryReservation != 0 { + if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + + if cgroup.Resources.KernelMemoryTCP != 0 { + if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { + return err + } + } + if cgroup.Resources.OomKillDisable { + if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { + return nil + } else if *cgroup.Resources.MemorySwappiness <= 100 { + if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) Remove(d *cgroupData) error { + return removePath(d.path("memory")) +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. + statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".") + value, err := fscommon.GetCgroupParamUint(path, useHierarchy) + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + return nil +} + +func memoryAssigned(cgroup *configs.Cgroup) bool { + return cgroup.Resources.Memory != 0 || + cgroup.Resources.MemoryReservation != 0 || + cgroup.Resources.MemorySwap > 0 || + cgroup.Resources.KernelMemory > 0 || + cgroup.Resources.KernelMemoryTCP > 0 || + cgroup.Resources.OomKillDisable || + (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1) +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = strings.Join([]string{"memory", name}, ".") + } + usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") + maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") + failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") + limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".") + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go new file mode 100644 index 0000000..62de563 --- /dev/null +++ b/libcontainer/cgroups/fs/memory_test.go @@ -0,0 +1,456 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + memoryStatContents = `cache 512 +rss 1024` + memoryUsageContents = "2048\n" + memoryMaxUsageContents = "4096\n" + memoryFailcnt = "100\n" + memoryLimitContents = "8192\n" + memoryUseHierarchyContents = "1\n" +) + +func TestMemorySetMemory(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 314572800 // 300M + memoryAfter = 524288000 // 500M + reservationBefore = 209715200 // 200M + reservationAfter = 314572800 // 300M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemoryReservation = reservationAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err) + } + if value != reservationAfter { + t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryswap(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryswapBefore = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryLargerThanSwap(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 314572800 // 300M + memoryswapBefore = 524288000 // 500M + memoryAfter = 629145600 // 600M + memoryswapAfter = 838860800 // 800M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetSwapSmallerThanMemory(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 629145600 // 600M + memoryswapBefore = 838860800 // 800M + memoryAfter = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetKernelMemory(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + kernelMemoryBefore = 314572800 // 300M + kernelMemoryAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore), + }) + + helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err) + } + if value != kernelMemoryAfter { + t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.") + } +} + +func TestMemorySetKernelMemoryTCP(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + kernelMemoryTCPBefore = 314572800 // 300M + kernelMemoryTCPAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore), + }) + + helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err) + } + if value != kernelMemoryTCPAfter { + t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.") + } +} + +func TestMemorySetMemorySwappinessDefault(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + swappinessBefore := 60 //default is 60 + swappinessAfter := uint64(0) + + helper.writeFileContents(map[string]string{ + "memory.swappiness": strconv.Itoa(swappinessBefore), + }) + + helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness") + if err != nil { + t.Fatalf("Failed to parse memory.swappiness - %s", err) + } + if value != swappinessAfter { + t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) + } +} + +func TestMemoryStats(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.failcnt": memoryFailcnt, + "memory.memsw.usage_in_bytes": memoryUsageContents, + "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents, + "memory.memsw.failcnt": memoryFailcnt, + "memory.memsw.limit_in_bytes": memoryLimitContents, + "memory.kmem.usage_in_bytes": memoryUsageContents, + "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents, + "memory.kmem.failcnt": memoryFailcnt, + "memory.kmem.limit_in_bytes": memoryLimitContents, + "memory.use_hierarchy": memoryUseHierarchyContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true} + expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) +} + +func TestMemoryStatsNoStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } +} + +func TestMemoryStatsNoUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": "rss rss", + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": "bad", + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": "bad", + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": "bad", + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemorySetOomControl(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + oomKillDisable = 1 // disable oom killer, default is 0 + ) + + helper.writeFileContents(map[string]string{ + "memory.oom_control": strconv.Itoa(oomKillDisable), + }) + + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control") + if err != nil { + t.Fatalf("Failed to parse memory.oom_control - %s", err) + } + + if value != oomKillDisable { + t.Fatalf("Got the wrong value, set memory.oom_control failed.") + } +} diff --git a/libcontainer/cgroups/fs/name.go b/libcontainer/cgroups/fs/name.go new file mode 100644 index 0000000..d8cf1d8 --- /dev/null +++ b/libcontainer/cgroups/fs/name.go @@ -0,0 +1,40 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(d *cgroupData) error { + if s.Join { + // ignore errors if the named cgroup does not exist + d.join(s.GroupName) + } + return nil +} + +func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *NameGroup) Remove(d *cgroupData) error { + if s.Join { + removePath(d.path(s.GroupName)) + } + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/net_cls.go b/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 0000000..0212015 --- /dev/null +++ b/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,44 @@ +// +build linux + +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetClsGroup struct { +} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(d *cgroupData) error { + _, err := d.join("net_cls") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.NetClsClassid != 0 { + if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) Remove(d *cgroupData) error { + return removePath(d.path("net_cls")) +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/net_cls_test.go b/libcontainer/cgroups/fs/net_cls_test.go new file mode 100644 index 0000000..602133a --- /dev/null +++ b/libcontainer/cgroups/fs/net_cls_test.go @@ -0,0 +1,41 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + classidBefore = 0x100002 + classidAfter = 0x100001 +) + +func TestNetClsSetClassid(t *testing.T) { + helper := NewCgroupTestUtil("net_cls", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "net_cls.classid": strconv.FormatUint(classidBefore, 10), + }) + + helper.CgroupData.config.Resources.NetClsClassid = classidAfter + netcls := &NetClsGroup{} + if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + // As we are in mock environment, we can't get correct value of classid from + // net_cls.classid. + // So. we just judge if we successfully write classid into file + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid") + if err != nil { + t.Fatalf("Failed to parse net_cls.classid - %s", err) + } + if value != classidAfter { + t.Fatal("Got the wrong value, set net_cls.classid failed.") + } +} diff --git a/libcontainer/cgroups/fs/net_prio.go b/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 0000000..2bdeedf --- /dev/null +++ b/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,42 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct { +} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(d *cgroupData) error { + _, err := d.join("net_prio") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, prioMap := range cgroup.Resources.NetPrioIfpriomap { + if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) Remove(d *cgroupData) error { + return removePath(d.path("net_prio")) +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/net_prio_test.go b/libcontainer/cgroups/fs/net_prio_test.go new file mode 100644 index 0000000..2ce8e19 --- /dev/null +++ b/libcontainer/cgroups/fs/net_prio_test.go @@ -0,0 +1,39 @@ +// +build linux + +package fs + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + prioMap = []*configs.IfPrioMap{ + { + Interface: "test", + Priority: 5, + }, + } +) + +func TestNetPrioSetIfPrio(t *testing.T) { + helper := NewCgroupTestUtil("net_prio", t) + defer helper.cleanup() + + helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap + netPrio := &NetPrioGroup{} + if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap") + if err != nil { + t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err) + } + if !strings.Contains(value, "test 5") { + t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") + } +} diff --git a/libcontainer/cgroups/fs/perf_event.go b/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 0000000..5693676 --- /dev/null +++ b/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,35 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct { +} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(d *cgroupData) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *PerfEventGroup) Remove(d *cgroupData) error { + return removePath(d.path("perf_event")) +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go new file mode 100644 index 0000000..7bf6801 --- /dev/null +++ b/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,74 @@ +// +build linux + +package fs + +import ( + "fmt" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := fscommon.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + maxString, err := fscommon.GetCgroupParamString(path, "pids.max") + if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = fscommon.ParseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/libcontainer/cgroups/fs/pids_test.go b/libcontainer/cgroups/fs/pids_test.go new file mode 100644 index 0000000..66f3aa3 --- /dev/null +++ b/libcontainer/cgroups/fs/pids_test.go @@ -0,0 +1,112 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": "max", + }) + + helper.CgroupData.config.Resources.PidsLimit = maxLimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + helper.CgroupData.config.Resources.PidsLimit = maxUnlimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != maxLimited { + t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit) + } +} + +func TestPidsStatsUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(4096), + "pids.max": "max", + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 4096 { + t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != 0 { + t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit) + } +} diff --git a/libcontainer/cgroups/fs/stats_util_test.go b/libcontainer/cgroups/fs/stats_util_test.go new file mode 100644 index 0000000..c5a8d18 --- /dev/null +++ b/libcontainer/cgroups/fs/stats_util_test.go @@ -0,0 +1,123 @@ +// +build linux + +package fs + +import ( + "fmt" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + + "github.com/sirupsen/logrus" +) + +func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { + if len(expected) != len(actual) { + return fmt.Errorf("blkioStatEntries length do not match") + } + for i, expValue := range expected { + actValue := actual[i] + if expValue != actValue { + return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue) + } + } + return nil +} + +func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { + if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { + logrus.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { + logrus.Printf("blkio IoServicedRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { + logrus.Printf("blkio IoQueuedRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { + logrus.Printf("blkio SectorsRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { + logrus.Printf("blkio IoServiceTimeRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { + logrus.Printf("blkio IoWaitTimeRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { + logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { + logrus.Printf("blkio IoTimeRecursive do not match - %s\n", err) + t.Fail() + } +} + +func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + if expected != actual { + logrus.Printf("Expected throttling data %v but found %v\n", expected, actual) + t.Fail() + } +} + +func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { + if expected != actual { + logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual) + t.Fail() + } +} + +func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + expectMemoryDataEquals(t, expected.Usage, actual.Usage) + expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) + expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) + + if expected.UseHierarchy != actual.UseHierarchy { + logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy) + t.Fail() + } + + for key, expValue := range expected.Stats { + actValue, ok := actual.Stats[key] + if !ok { + logrus.Printf("Expected memory stat key %s not found\n", key) + t.Fail() + } + if expValue != actValue { + logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue) + t.Fail() + } + } +} + +func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { + if expected.Usage != actual.Usage { + logrus.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage) + t.Fail() + } + if expected.MaxUsage != actual.MaxUsage { + logrus.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage) + t.Fail() + } + if expected.Failcnt != actual.Failcnt { + logrus.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt) + t.Fail() + } + if expected.Limit != actual.Limit { + logrus.Printf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit) + t.Fail() + } +} diff --git a/libcontainer/cgroups/fs/util_test.go b/libcontainer/cgroups/fs/util_test.go new file mode 100644 index 0000000..2c50d6f --- /dev/null +++ b/libcontainer/cgroups/fs/util_test.go @@ -0,0 +1,68 @@ +// +build linux + +/* +Utility for testing cgroup operations. + +Creates a mock of the cgroup filesystem for the duration of the test. +*/ +package fs + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type cgroupTestUtil struct { + // cgroup data to use in tests. + CgroupData *cgroupData + + // Path to the mock cgroup directory. + CgroupPath string + + // Temporary directory to store mock cgroup filesystem. + tempDir string + t *testing.T +} + +// Creates a new test util for the specified subsystem +func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil { + d := &cgroupData{ + config: &configs.Cgroup{}, + } + d.config.Resources = &configs.Resources{} + tempDir, err := ioutil.TempDir("", "cgroup_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testCgroupPath := filepath.Join(d.root, subsystem) + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock cgroup path exists. + err = os.MkdirAll(testCgroupPath, 0755) + if err != nil { + t.Fatal(err) + } + return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t} +} + +func (c *cgroupTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified cgroup files. +func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := fscommon.WriteFile(c.CgroupPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/libcontainer/cgroups/fs2/cpu.go b/libcontainer/cgroups/fs2/cpu.go new file mode 100644 index 0000000..f0f5df0 --- /dev/null +++ b/libcontainer/cgroups/fs2/cpu.go @@ -0,0 +1,56 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func setCpu(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuWeight != 0 { + if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil { + return err + } + } + + if cgroup.Resources.CpuMax != "" { + if err := fscommon.WriteFile(dirPath, "cpu.max", cgroup.Resources.CpuMax); err != nil { + return err + } + } + + return nil +} +func statCpu(dirPath string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(dirPath, "cpu.stat")) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + } + } + return nil +} diff --git a/libcontainer/cgroups/fs2/cpuset.go b/libcontainer/cgroups/fs2/cpuset.go new file mode 100644 index 0000000..6492ac9 --- /dev/null +++ b/libcontainer/cgroups/fs2/cpuset.go @@ -0,0 +1,22 @@ +// +build linux + +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func setCpuset(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go new file mode 100644 index 0000000..e84b33f --- /dev/null +++ b/libcontainer/cgroups/fs2/defaultpath.go @@ -0,0 +1,99 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "bufio" + "io" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" +) + +const UnifiedMountpoint = "/sys/fs/cgroup" + +func defaultDirPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) + } + if len(c.Paths) != 0 { + // never set by specconv + return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c) + } + + // XXX: Do not remove this code. Path safety is important! -- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName, ownCgroup) +} + +func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string, error) { + if (cgName != "" || cgParent != "") && cgPath != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } + if filepath.IsAbs(innerPath) { + return filepath.Join(root, innerPath), nil + } + return filepath.Join(root, ownCgroup, innerPath), nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + var ( + s = bufio.NewScanner(r) + ) + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", errors.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + return "", errors.New("cgroup path not found") +} diff --git a/libcontainer/cgroups/fs2/defaultpath_test.go b/libcontainer/cgroups/fs2/defaultpath_test.go new file mode 100644 index 0000000..6d5d117 --- /dev/null +++ b/libcontainer/cgroups/fs2/defaultpath_test.go @@ -0,0 +1,76 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "strings" + "testing" +) + +func TestParseCgroupFromReader(t *testing.T) { + cases := map[string]string{ + "0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + "2:cpuset:/foo\n1:name=systemd:/\n": "", + "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + } + for s, expected := range cases { + g, err := parseCgroupFromReader(strings.NewReader(s)) + if expected != "" { + if string(g) != expected { + t.Errorf("expected %q, got %q", expected, string(g)) + } + if err != nil { + t.Error(err) + } + } else { + if err == nil { + t.Error("error is expected") + } + } + } +} + +func TestDefaultDirPath(t *testing.T) { + root := "/sys/fs/cgroup" + cases := []struct { + cgPath string + cgParent string + cgName string + ownCgroup string + expected string + }{ + { + cgPath: "/foo/bar", + ownCgroup: "/apple/banana", + expected: "/sys/fs/cgroup/foo/bar", + }, + { + cgPath: "foo/bar", + ownCgroup: "/apple/banana", + expected: "/sys/fs/cgroup/apple/banana/foo/bar", + }, + } + for _, c := range cases { + got, err := _defaultDirPath(root, c.cgPath, c.cgParent, c.cgName, c.ownCgroup) + if err != nil { + t.Fatal(err) + } + if got != c.expected { + t.Fatalf("expected %q, got %q", c.expected, got) + } + } +} diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go new file mode 100644 index 0000000..e0fd685 --- /dev/null +++ b/libcontainer/cgroups/fs2/devices.go @@ -0,0 +1,73 @@ +// +build linux + +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func isRWM(cgroupPermissions string) bool { + r := false + w := false + m := false + for _, rn := range cgroupPermissions { + switch rn { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from crun +// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652 +func canSkipEBPFError(cgroup *configs.Cgroup) bool { + for _, dev := range cgroup.Resources.Devices { + if dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func setDevices(dirPath string, cgroup *configs.Cgroup) error { + devices := cgroup.Devices + if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil { + // never set by OCI specconv, but *allowAllDevices=false is still used by the integration test + if *allowAllDevices == true { + return errors.New("libcontainer AllowAllDevices is not supported, use Devices") + } + for _, ad := range cgroup.Resources.AllowedDevices { + d := *ad + d.Allow = true + devices = append(devices, &d) + } + } + if len(cgroup.Resources.DeniedDevices) != 0 { + // never set by OCI specconv + return errors.New("libcontainer DeniedDevices is not supported, use Devices") + } + insts, license, err := devicefilter.DeviceFilter(devices) + if err != nil { + return err + } + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + if err != nil { + return errors.Errorf("cannot get dir FD for %s", dirPath) + } + defer unix.Close(dirFD) + if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(cgroup) { + return err + } + } + return nil +} diff --git a/libcontainer/cgroups/fs2/freezer.go b/libcontainer/cgroups/fs2/freezer.go new file mode 100644 index 0000000..130c63f --- /dev/null +++ b/libcontainer/cgroups/fs2/freezer.go @@ -0,0 +1,53 @@ +// +build linux + +package fs2 + +import ( + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +func setFreezer(dirPath string, state configs.FreezerState) error { + var desired int + switch state { + case configs.Undefined: + return nil + case configs.Frozen: + desired = 1 + case configs.Thawed: + desired = 0 + default: + return errors.Errorf("unknown freezer state %+v", state) + } + supportedErr := supportsFreezer(dirPath) + if supportedErr != nil && desired != 0 { + // can ignore error if desired == 1 + return errors.Wrap(supportedErr, "freezer not supported") + } + return freezeWithInt(dirPath, desired) +} + +func supportsFreezer(dirPath string) error { + _, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + return err +} + +// freeze writes desired int to "cgroup.freeze". +func freezeWithInt(dirPath string, desired int) error { + desiredS := strconv.Itoa(desired) + if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil { + return err + } + got, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + if err != nil { + return err + } + if gotS := strings.TrimSpace(string(got)); gotS != desiredS { + return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS) + } + return nil +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go new file mode 100644 index 0000000..4bb7091 --- /dev/null +++ b/libcontainer/cgroups/fs2/fs2.go @@ -0,0 +1,214 @@ +// +build linux + +package fs2 + +import ( + "io/ioutil" + "os" + "path/filepath" + "strings" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. +func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) { + if config == nil { + config = &configs.Cgroup{} + } + if dirPath != "" { + if filepath.Clean(dirPath) != dirPath || !filepath.IsAbs(dirPath) { + return nil, errors.Errorf("invalid dir path %q", dirPath) + } + } else { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + controllers, err := detectControllers(dirPath) + if err != nil && !rootless { + return nil, err + } + + m := &manager{ + config: config, + dirPath: dirPath, + controllers: controllers, + rootless: rootless, + } + return m, nil +} + +func detectControllers(dirPath string) (map[string]struct{}, error) { + if err := os.MkdirAll(dirPath, 0755); err != nil { + return nil, err + } + controllersPath, err := securejoin.SecureJoin(dirPath, "cgroup.controllers") + if err != nil { + return nil, err + } + controllersData, err := ioutil.ReadFile(controllersPath) + if err != nil { + return nil, err + } + controllersFields := strings.Fields(string(controllersData)) + controllers := make(map[string]struct{}, len(controllersFields)) + for _, c := range controllersFields { + controllers[c] = struct{}{} + } + return controllers, nil +} + +type manager struct { + config *configs.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} + rootless bool +} + +func (m *manager) Apply(pid int) error { + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil && !m.rootless { + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + var ( + st cgroups.Stats + errs []error + ) + // pids (since kernel 4.5) + if _, ok := m.controllers["pids"]; ok { + if err := statPids(m.dirPath, &st); err != nil { + errs = append(errs, err) + } + } else { + if err := statPidsWithoutController(m.dirPath, &st); err != nil { + errs = append(errs, err) + } + } + // memory (since kenrel 4.5) + if _, ok := m.controllers["memory"]; ok { + if err := statMemory(m.dirPath, &st); err != nil { + errs = append(errs, err) + } + } + // io (since kernel 4.5) + if _, ok := m.controllers["io"]; ok { + if err := statIo(m.dirPath, &st); err != nil { + errs = append(errs, err) + } + } + // cpu (since kernel 4.15) + if _, ok := m.controllers["cpu"]; ok { + if err := statCpu(m.dirPath, &st); err != nil { + errs = append(errs, err) + } + } + if len(errs) > 0 && !m.rootless { + return &st, errors.Errorf("error while statting cgroup v2: %+v", errs) + } + return &st, nil +} + +func (m *manager) Freeze(state configs.FreezerState) error { + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *manager) Destroy() error { + return os.RemoveAll(m.dirPath) +} + +// GetPaths is for compatibility purpose and should be removed in future +func (m *manager) GetPaths() map[string]string { + paths := map[string]string{ + // pseudo-controller for compatibility + "devices": m.dirPath, + "freezer": m.dirPath, + } + for c := range m.controllers { + paths[c] = m.dirPath + } + return paths +} + +func (m *manager) GetUnifiedPath() (string, error) { + return m.dirPath, nil +} + +func (m *manager) Set(container *configs.Config) error { + if container == nil || container.Cgroups == nil { + return nil + } + var errs []error + // pids (since kernel 4.5) + if _, ok := m.controllers["pids"]; ok { + if err := setPids(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + } + // memory (since kernel 4.5) + if _, ok := m.controllers["memory"]; ok { + if err := setMemory(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + } + // io (since kernel 4.5) + if _, ok := m.controllers["io"]; ok { + if err := setIo(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + } + // cpu (since kernel 4.15) + if _, ok := m.controllers["cpu"]; ok { + if err := setCpu(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + } + // devices (since kernel 4.15, pseudo-controller) + if err := setDevices(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + // cpuset (since kernel 5.0) + if _, ok := m.controllers["cpuset"]; ok { + if err := setCpuset(m.dirPath, container.Cgroups); err != nil { + errs = append(errs, err) + } + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil { + errs = append(errs, err) + } + if len(errs) > 0 && !m.rootless { + return errors.Errorf("error while setting cgroup v2: %+v", errs) + } + m.config = container.Cgroups + return nil +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.config, nil +} diff --git a/libcontainer/cgroups/fs2/io.go b/libcontainer/cgroups/fs2/io.go new file mode 100644 index 0000000..9a07308 --- /dev/null +++ b/libcontainer/cgroups/fs2/io.go @@ -0,0 +1,124 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func setIo(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.Resources.BlkioWeight != 0 { + filename := "io.bfq.weight" + if err := fscommon.WriteFile(dirPath, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + p := filepath.Join(dirPath, name) + f, err := os.Open(p) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, err + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var ioServiceBytesRecursive []cgroups.BlkioStatEntry + values, err := readCgroup2MapFile(dirPath, "io.stat") + if err != nil { + return err + } + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive} + return nil +} diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go new file mode 100644 index 0000000..23eccbe --- /dev/null +++ b/libcontainer/cgroups/fs2/memory.go @@ -0,0 +1,103 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +func setMemory(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.Resources.MemorySwap != 0 { + if err := fscommon.WriteFile(dirPath, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + if cgroup.Resources.Memory != 0 { + if err := fscommon.WriteFile(dirPath, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if cgroup.Resources.MemoryReservation != 0 { + if err := fscommon.WriteFile(dirPath, "memory.low", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + // Set stats from memory.stat. + statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat")) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text()) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + + stats.MemoryStats.UseHierarchy = true + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = strings.Join([]string{"memory", name}, ".") + } + usage := strings.Join([]string{moduleName, "current"}, ".") + limit := strings.Join([]string{moduleName, "max"}, ".") + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage) + } + memoryData.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/libcontainer/cgroups/fs2/pids.go b/libcontainer/cgroups/fs2/pids.go new file mode 100644 index 0000000..db2d7ac --- /dev/null +++ b/libcontainer/cgroups/fs2/pids.go @@ -0,0 +1,90 @@ +// +build linux + +package fs2 + +import ( + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func setPids(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := fscommon.WriteFile(dirPath, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func isNOTSUP(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.ENOTSUP + default: + return false + } +} + +func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs")) + if err != nil && isNOTSUP(err) { + contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads")) + } + if err != nil { + return err + } + pids := make(map[string]string) + for _, i := range strings.Split(string(contents), "\n") { + if i != "" { + pids[i] = i + } + } + stats.PidsStats.Current = uint64(len(pids)) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + return errors.Wrap(err, "failed to parse pids.current") + } + + maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max") + if err != nil { + return errors.Wrap(err, "failed to parse pids.max") + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = fscommon.ParseUint(maxString, 10, 64) + if err != nil { + return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", + maxString, filepath.Join(dirPath, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/libcontainer/cgroups/fscommon/fscommon.go b/libcontainer/cgroups/fscommon/fscommon.go new file mode 100644 index 0000000..dd92e8c --- /dev/null +++ b/libcontainer/cgroups/fscommon/fscommon.go @@ -0,0 +1,36 @@ +// +build linux + +package fscommon + +import ( + "io/ioutil" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/pkg/errors" +) + +func WriteFile(dir, file, data string) error { + if dir == "" { + return errors.Errorf("no directory specified for %s", file) + } + path, err := securejoin.SecureJoin(dir, file) + if err != nil { + return err + } + if err := ioutil.WriteFile(path, []byte(data), 0700); err != nil { + return errors.Wrapf(err, "failed to write %q to %q", data, path) + } + return nil +} + +func ReadFile(dir, file string) (string, error) { + if dir == "" { + return "", errors.Errorf("no directory specified for %s", file) + } + path, err := securejoin.SecureJoin(dir, file) + if err != nil { + return "", err + } + data, err := ioutil.ReadFile(path) + return string(data), err +} diff --git a/libcontainer/cgroups/fscommon/utils.go b/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 0000000..46c3c77 --- /dev/null +++ b/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,83 @@ +// +build linux + +package fscommon + +import ( + "errors" + "fmt" + "io/ioutil" + "math" + "path/filepath" + "strconv" + "strings" +) + +var ( + ErrNotValidFormat = errors.New("line is not a valid key value format") +) + +// Saturates negative values at zero and returns a uint64. +// Due to kernel bugs, some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Parses a cgroup param and returns as name, value +// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234 +func GetCgroupParamKeyValue(t string) (string, uint64, error) { + parts := strings.Fields(t) + switch len(parts) { + case 2: + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err) + } + + return parts[0], value, nil + default: + return "", 0, ErrNotValidFormat + } +} + +// Gets a single uint64 value from the specified cgroup file. +func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { + fileName := filepath.Join(cgroupPath, cgroupFile) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + trimmed := strings.TrimSpace(string(contents)) + if trimmed == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(trimmed, 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified cgroup file +func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} diff --git a/libcontainer/cgroups/fscommon/utils_test.go b/libcontainer/cgroups/fscommon/utils_test.go new file mode 100644 index 0000000..d0c5668 --- /dev/null +++ b/libcontainer/cgroups/fscommon/utils_test.go @@ -0,0 +1,97 @@ +// +build linux + +package fscommon + +import ( + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "testing" +) + +const ( + cgroupFile = "cgroup.file" + floatValue = 2048.0 + floatString = "2048" +) + +func TestGetCgroupParamsInt(t *testing.T) { + // Setup tempdir. + tempDir, err := ioutil.TempDir("", "cgroup_utils_test") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tempDir) + tempFile := filepath.Join(tempDir, cgroupFile) + + // Success. + err = ioutil.WriteFile(tempFile, []byte(floatString), 0755) + if err != nil { + t.Fatal(err) + } + value, err := GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with new line. + err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with negative values + err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Success with negative values lesser than min int64 + s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) + err = ioutil.WriteFile(tempFile, []byte(s), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Not a float. + err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } + + // Unknown file. + err = os.Remove(tempFile) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go new file mode 100644 index 0000000..8eeedc5 --- /dev/null +++ b/libcontainer/cgroups/stats.go @@ -0,0 +1,108 @@ +// +build linux + +package cgroups + +type ThrottlingData struct { + // Number of periods with throttling active + Periods uint64 `json:"periods,omitempty"` + // Number of periods when the container hit its throttling limit. + ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` + // Aggregate time the container was throttled for in nanoseconds. + ThrottledTime uint64 `json:"throttled_time,omitempty"` +} + +// CpuUsage denotes the usage of a CPU. +// All CPU stats are aggregate since container inception. +type CpuUsage struct { + // Total CPU time consumed. + // Units: nanoseconds. + TotalUsage uint64 `json:"total_usage,omitempty"` + // Total CPU time consumed per core. + // Units: nanoseconds. + PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` + + Stats map[string]uint64 `json:"stats,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes tranferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of times hugetlb usage allocation failure. + Failcnt uint64 `json:"failcnt"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} +} diff --git a/libcontainer/cgroups/systemd/apply_nosystemd.go b/libcontainer/cgroups/systemd/apply_nosystemd.go new file mode 100644 index 0000000..ef0db5a --- /dev/null +++ b/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -0,0 +1,67 @@ +// +build !linux + +package systemd + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func UseSystemd() bool { + return false +} + +func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Apply(pid int) error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetPids() ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetAllPids() ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Destroy() error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetPaths() map[string]string { + return nil +} + +func (m *Manager) GetUnifiedPath() (string, error) { + return "", fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Set(container *configs.Config) error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + return fmt.Errorf("Systemd not supported") +} + +func Freeze(c *configs.Cgroup, state configs.FreezerState) error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go new file mode 100644 index 0000000..c4b19b3 --- /dev/null +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -0,0 +1,534 @@ +// +build linux + +package systemd + +import ( + "errors" + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/godbus/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +type LegacyManager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set the cgroup represented by cgroup. + Set(path string, cgroup *configs.Cgroup) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +type subsystemSet []subsystem + +func (s subsystemSet) Get(name string) (subsystem, error) { + for _, ss := range s { + if ss.Name() == name { + return ss, nil + } + } + return nil, errSubsystemDoesNotExist +} + +var legacySubsystems = subsystemSet{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +const ( + testScopeWait = 4 + testSliceWait = 4 +) + +var ( + connLock sync.Mutex + theConn *systemdDbus.Conn +) + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func isRunningSystemd() bool { + fi, err := os.Lstat("/run/systemd/system") + if err != nil { + return false + } + return fi.IsDir() +} + +func UseSystemd() bool { + if !isRunningSystemd() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemdDbus.New() + if err != nil { + return false + } + } + return true +} + +func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { + if !isRunningSystemd() { + return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager") + } + if cgroups.IsCgroup2UnifiedMode() { + return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &UnifiedManager{ + Cgroups: config, + Paths: paths, + } + }, nil + } + return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &LegacyManager{ + Cgroups: config, + Paths: paths, + } + }, nil +} + +func (m *LegacyManager) Apply(pid int) error { + var ( + c = m.Cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if !strings.HasSuffix(unitName, ".slice") { + // Assume scopes always support delegation. + properties = append(properties, newProp("Delegate", true)) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + if c.Resources.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(c.Resources.Memory))) + } + + if c.Resources.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", c.Resources.CpuShares)) + } + + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + if c.Resources.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) + } + + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { + if err := setKernelMemory(c); err != nil { + return err + } + } + + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) + } + } else if !isUnitExists(err) { + return err + } + + if err := joinCgroups(c, pid); err != nil { + return err + } + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[s.Name()] = subsystemPath + } + m.Paths = paths + return nil +} + +func (m *LegacyManager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *LegacyManager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} + +func (m *LegacyManager) GetUnifiedPath() (string, error) { + return "", errors.New("unified path is only supported when running in unified mode") +} + +func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { + path, err := getSubsystemPath(c, subsystem) + if err != nil { + return "", err + } + + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return "", err + } + return path, nil +} + +func joinCgroups(c *configs.Cgroup, pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + path, err := getSubsystemPath(c, name) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, c, pid); err != nil { + return err + } + default: + _, err := join(c, name, pid) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if name == "devices" { + return err + } + // For other subsystems, omit the `not found` error + // because they are optional. + if !cgroups.IsNotFound(err) { + return err + } + } + } + } + + return nil +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem) + if err != nil { + return "", err + } + + initPath, err := cgroups.GetInitCgroup(subsystem) + if err != nil { + return "", err + } + // if pid 1 is systemd 226 or later, it will be in init.scope, not the root + initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") + + slice := "system.slice" + if c.Parent != "" { + slice = c.Parent + } + + slice, err = ExpandSlice(slice) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil +} + +func (m *LegacyManager) Freeze(state configs.FreezerState) error { + path, err := getSubsystemPath(m.Cgroups, "freezer") + if err != nil { + return err + } + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := legacySubsystems.Get("freezer") + if err != nil { + return err + } + err = freezer.Set(path, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *LegacyManager) GetPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(path) +} + +func (m *LegacyManager) GetAllPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + +func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := legacySubsystems.Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *LegacyManager) Set(container *configs.Config) error { + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups.Paths != nil { + return nil + } + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} + +func getUnitName(c *configs.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) + } + return c.Name +} + +func setKernelMemory(c *configs.Cgroup) error { + path, err := getSubsystemPath(c, "memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // do not try to enable the kernel memory if we already have + // tasks in the cgroup. + content, err := ioutil.ReadFile(filepath.Join(path, "tasks")) + if err != nil { + return err + } + if len(content) > 0 { + return nil + } + return fs.EnableKernelMemoryAccounting(path) +} + +// isUnitExists returns true if the error is that a systemd unit already exists. +func isUnitExists(err error) bool { + if err != nil { + if dbusError, ok := err.(dbus.Error); ok { + return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") + } + } + return false +} + +func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/libcontainer/cgroups/systemd/unified_hierarchy.go b/libcontainer/cgroups/systemd/unified_hierarchy.go new file mode 100644 index 0000000..6605099 --- /dev/null +++ b/libcontainer/cgroups/systemd/unified_hierarchy.go @@ -0,0 +1,312 @@ +// +build linux + +package systemd + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +type UnifiedManager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +func (m *UnifiedManager) Apply(pid int) error { + var ( + c = m.Cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if !strings.HasSuffix(unitName, ".slice") { + // Assume scopes always support delegation. + properties = append(properties, newProp("Delegate", true)) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + if c.Resources.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(c.Resources.Memory))) + } + + if c.Resources.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", c.Resources.CpuShares)) + } + + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + if c.Resources.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) + } + + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { + if err := setKernelMemory(c); err != nil { + return err + } + } + + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) + } + } else if !isUnitExists(err) { + return err + } + + if err := joinCgroupsV2(c, pid); err != nil { + return err + } + + path, err := getSubsystemPath(m.Cgroups, "") + if err != nil { + return err + } + m.Paths = map[string]string{ + "pids": path, + "memory": path, + "io": path, + "cpu": path, + "devices": path, + "cpuset": path, + "freezer": path, + } + return nil +} + +func (m *UnifiedManager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *UnifiedManager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} +func (m *UnifiedManager) GetUnifiedPath() (string, error) { + unifiedPath := "" + m.mu.Lock() + defer m.mu.Unlock() + for k, v := range m.Paths { + if unifiedPath == "" { + unifiedPath = v + } else if v != unifiedPath { + return unifiedPath, + errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v) + } + } + if unifiedPath == "" { + // FIXME: unified path could be detected even when no controller is available + return unifiedPath, errors.New("cannot detect unified path") + } + return unifiedPath, nil +} +func createCgroupsv2Path(path string) (Err error) { + content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + if err != nil { + return err + } + if !filepath.HasPrefix(path, "/sys/fs/cgroup") { + return fmt.Errorf("invalid cgroup path %s", path) + } + + res := "" + for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") { + if i == 0 { + res = fmt.Sprintf("+%s", c) + } else { + res = res + fmt.Sprintf(" +%s", c) + } + } + resByte := []byte(res) + + current := "/sys/fs" + elements := strings.Split(path, "/") + for i, e := range elements[3:] { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + } + if i < len(elements[3:])-1 { + if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil { + return err + } + } + } + return nil +} + +func joinCgroupsV2(c *configs.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "memory") + if err != nil { + return err + } + return createCgroupsv2Path(path) +} + +func (m *UnifiedManager) fsManager() (cgroups.Manager, error) { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return fs2.NewManager(m.Cgroups, path, false) +} + +func (m *UnifiedManager) Freeze(state configs.FreezerState) error { + fsMgr, err := m.fsManager() + if err != nil { + return err + } + return fsMgr.Freeze(state) +} + +func (m *UnifiedManager) GetPids() ([]int, error) { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return cgroups.GetPids(path) +} + +func (m *UnifiedManager) GetAllPids() ([]int, error) { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + +func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { + fsMgr, err := m.fsManager() + if err != nil { + return nil, err + } + return fsMgr.GetStats() +} + +func (m *UnifiedManager) Set(container *configs.Config) error { + fsMgr, err := m.fsManager() + if err != nil { + return err + } + return fsMgr.Set(container) +} + +func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go new file mode 100644 index 0000000..dbcc58f --- /dev/null +++ b/libcontainer/cgroups/utils.go @@ -0,0 +1,588 @@ +// +build linux + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + units "github.com/docker/go-units" + "golang.org/x/sys/unix" +) + +const ( + CgroupNamePrefix = "name=" + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" +) + +var ( + isUnifiedOnce sync.Once + isUnified bool +) + +// HugePageSizeUnitList is a list of the units used by the linux kernel when +// naming the HugePage control files. +// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt +// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed, +// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 +var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st syscall.Statfs_t + if err := syscall.Statfs(unifiedMountpoint, &st); err != nil { + panic("cannot statfs cgroup root") + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return unifiedMountpoint, nil + } + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + // We are not using mount.GetMounts() because it's super-inefficient, + // parsing it directly sped up x10 times because of not using Sscanf. + // It was one of two major performance drawbacks in container start. + if !isSubsystemAvailable(subsystem) { + return "", "", NewNotFoundError(subsystem) + } + + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", "", err + } + defer f.Close() + + if IsCgroup2UnifiedMode() { + subsystem = "" + } + + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { + scanner := bufio.NewScanner(reader) + for scanner.Scan() { + txt := scanner.Text() + fields := strings.Fields(txt) + if len(fields) < 9 { + continue + } + if strings.HasPrefix(fields[4], cgroupPath) { + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem { + return fields[4], fields[3], nil + } + } + } + } + if err := scanner.Err(); err != nil { + return "", "", err + } + + return "", "", NewNotFoundError(subsystem) +} + +func isSubsystemAvailable(subsystem string) bool { + if IsCgroup2UnifiedMode() { + controllers, err := GetAllSubsystems() + if err != nil { + return false + } + for _, c := range controllers { + if c == subsystem { + return true + } + } + return false + } + + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return false + } + _, avail := cgroups[subsystem] + return avail +} + +func GetClosestMountpointAncestor(dir, mountinfo string) string { + deepestMountPoint := "" + for _, mountInfoEntry := range strings.Split(mountinfo, "\n") { + mountInfoParts := strings.Fields(mountInfoEntry) + if len(mountInfoParts) < 5 { + continue + } + mountPoint := mountInfoParts[4] + if strings.HasPrefix(mountPoint, deepestMountPoint) && strings.HasPrefix(dir, mountPoint) { + deepestMountPoint = mountPoint + } + } + return deepestMountPoint +} + +func FindCgroupMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + text := scanner.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "cgroup" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" { + // Check that the mount is properly formatted. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return filepath.Dir(fields[4]), nil + } + } + if err := scanner.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("cgroup") +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", fmt.Errorf("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + scanner := bufio.NewScanner(mi) + numFound := 0 + for scanner.Scan() && numFound < len(ss) { + txt := scanner.Text() + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + return nil, fmt.Errorf("invalid mountinfo format") + } + if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { + continue + } + fields := strings.Split(txt, " ") + m := Mount{ + Mountpoint: fields[4], + Root: fields[3], + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + if strings.HasPrefix(opt, CgroupNamePrefix) { + opt = opt[len(CgroupNamePrefix):] + } + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + return res, nil +} + +// GetCgroupMounts returns the mounts for the cgroup subsystems. +// all indicates whether to return just the first instance or all the mounts. +func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + return getCgroupMountsHelper(allMap, f, all) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(string(data))...) + return subsystems, nil + } + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +// GetOwnCgroup returns the relative path to the cgroup docker is running in. +func GetOwnCgroup(subsystem string) (string, error) { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { + cgroups, err := ParseCgroupFile("/proc/1/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func readProcsFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, CgroupProcesses)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +// ParseCgroupFile parses the given cgroup file, typically from +// /proc//cgroup, into a map of subgroups to cgroup names. +func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return cgroups, nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "/", nil + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +func EnterPid(cgroupPaths map[string]string, pid int) error { + for _, path := range cgroupPaths { + if PathExists(path) { + if err := WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + return nil +} + +// RemovePaths iterates over the provided paths removing them. +// We trying to remove all paths five times with increasing delay between tries. +// If after all there are not removed cgroups - appropriate error will be +// returned. +func RemovePaths(paths map[string]string) (err error) { + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + for s, p := range paths { + os.RemoveAll(p) + // TODO: here probably should be logging + _, err := os.Stat(p) + // We need this strange way of checking cgroups existence because + // RemoveAll almost always returns error, even on already removed + // cgroups + if os.IsNotExist(err) { + delete(paths, s) + } + } + if len(paths) == 0 { + return nil + } + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +func GetHugePageSize() ([]string, error) { + files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") + if err != nil { + return []string{}, err + } + var fileNames []string + for _, st := range files { + fileNames = append(fileNames, st.Name()) + } + return getHugePageSizeFromFilenames(fileNames) +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + var pageSizes []string + for _, fileName := range fileNames { + nameArray := strings.Split(fileName, "-") + pageSize, err := units.RAMInBytes(nameArray[1]) + if err != nil { + return []string{}, err + } + sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList) + pageSizes = append(pageSizes, sizeString) + } + + return pageSizes, nil +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(path string) ([]int, error) { + return readProcsFile(path) +} + +// GetAllPids returns all pids, that were added to cgroup at path and to all its +// subcgroups. +func GetAllPids(path string) ([]int, error) { + var pids []int + // collect pids from all sub-cgroups + err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { + dir, file := filepath.Split(p) + if file != CgroupProcesses { + return nil + } + if iErr != nil { + return iErr + } + cPids, err := readProcsFile(dir) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + if err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + defer cgroupProcessesFile.Close() + + for i := 0; i < 5; i++ { + _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. + if isEINVAL(err) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + return err +} + +func isEINVAL(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.EINVAL + default: + return false + } +} diff --git a/libcontainer/cgroups/utils_test.go b/libcontainer/cgroups/utils_test.go new file mode 100644 index 0000000..3214b9d --- /dev/null +++ b/libcontainer/cgroups/utils_test.go @@ -0,0 +1,459 @@ +// +build linux + +package cgroups + +import ( + "bytes" + "errors" + "fmt" + "reflect" + "strings" + "testing" +) + +const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755 +18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw +20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 +22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755 +23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755 +24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children +27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children +28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children +29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children +30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children +31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children +32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children +33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children +34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children +35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered +36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct +37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel +38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel +39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel +40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw +41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw +42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw +43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw +45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered +46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered +47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered +48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered +121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000 +124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw +165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered +167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered +171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered +175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered +179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered +183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered +187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered +191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered +195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered +199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered +203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered +207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered +211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered +215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered +219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered +223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered +227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered +231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered +235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered +239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered +243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered +247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered +31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1` + +const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1 +116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755 +118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 +121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices +123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer +124 120 0:22 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory +125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio +126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio +127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children +128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct +129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event +130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered +131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered +132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered +133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered +134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k +135 117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000 +84 115 0:40 / /tmp rw,relatime - tmpfs none rw` + +const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event` + +const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755 +21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 +24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755 +25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755 +26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw +27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel +28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw +29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct +30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory +31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio +32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio +33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event +34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb +35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer +36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids +61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw +64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered +39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw +40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel +41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel +` + +func TestGetCgroupMounts(t *testing.T) { + type testData struct { + mountInfo string + root string + subsystems map[string]bool + } + testTable := []testData{ + { + mountInfo: fedoraMountinfo, + root: "/", + subsystems: map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + }, + }, + { + mountInfo: systemdMountinfo, + root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", + subsystems: map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + }, + }, + { + mountInfo: bedrockMountinfo, + root: "/", + subsystems: map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + }, + }, + } + for _, td := range testTable { + mi := bytes.NewBufferString(td.mountInfo) + cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + cgMap := make(map[string]Mount) + for _, m := range cgMounts { + for _, ss := range m.Subsystems { + cgMap[ss] = m + } + } + for ss := range td.subsystems { + m, ok := cgMap[ss] + if !ok { + t.Fatalf("%s not found", ss) + } + if m.Root != td.root { + t.Fatalf("unexpected root for %s: %s", ss, m.Root) + } + if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) { + t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint) + } + var ssFound bool + for _, mss := range m.Subsystems { + if mss == ss { + ssFound = true + break + } + } + if !ssFound { + t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) + } + } + } +} + +func BenchmarkGetCgroupMounts(b *testing.B) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + mi := bytes.NewBufferString(fedoraMountinfo) + b.StartTimer() + if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { + b.Fatal(err) + } + } +} + +func TestParseCgroupString(t *testing.T) { + testCases := []struct { + input string + expectedError error + expectedOutput map[string]string + }{ + { + // Taken from a CoreOS instance running systemd 225 with CPU/Mem + // accounting enabled in systemd + input: `9:blkio:/ +8:freezer:/ +7:perf_event:/ +6:devices:/system.slice/system-sshd.slice +5:cpuset:/ +4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +3:net_cls,net_prio:/ +2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`, + expectedOutput: map[string]string{ + "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "blkio": "/", + "freezer": "/", + "perf_event": "/", + "devices": "/system.slice/system-sshd.slice", + "cpuset": "/", + "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "net_cls": "/", + "net_prio": "/", + "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + }, + }, + { + input: `malformed input`, + expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`), + }, + } + + for ndx, testCase := range testCases { + out, err := parseCgroupFromReader(strings.NewReader(testCase.input)) + if err != nil { + if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() { + t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err) + } + } else { + if !reflect.DeepEqual(testCase.expectedOutput, out) { + t.Errorf("%v: expected output %v, got error %v", ndx, testCase.expectedOutput, out) + } + } + } + +} + +func TestIgnoreCgroup2Mount(t *testing.T) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "pids": false, + "name=systemd": false, + } + + mi := bytes.NewBufferString(cgroup2Mountinfo) + cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + for _, m := range cgMounts { + if m.Mountpoint == "/sys/fs/cgroup/systemd" { + t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it") + } + } +} + +func TestGetClosestMountpointAncestor(t *testing.T) { + fakeMountInfo := ` 18 24 0:17 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +100 99 1:31 / /foo/bar rw,relatime - fake fake rw,fake +100 99 1:31 / /foo/bar/baz2 rw,relatime - fake fake rw,fake +100 99 1:31 / /foo/bar/baz rw,relatime - fake fake rw,fake +100 99 1:31 / /foo/bar/bazza rw,relatime - fake fake rw,fake +100 99 1:31 / /foo/bar/baz3 rw,relatime - fake fake rw,fake +100 99 1:31 / /foo rw,relatime - fake fake rw,fake +100 99 1:31 / /unrelated rw,relatime - fake fake rw,fake +100 99 1:31 / / rw,relatime - fake fake rw,fake +` + testCases := []struct { + input string + output string + }{ + {input: "/foo/bar/baz/a/b/c", output: "/foo/bar/baz"}, + {input: "/foo/bar/baz", output: "/foo/bar/baz"}, + {input: "/foo/bar/bazza", output: "/foo/bar/bazza"}, + {input: "/a/b/c/d", output: "/"}, + } + + for _, c := range testCases { + mountpoint := GetClosestMountpointAncestor(c.input, fakeMountInfo) + if mountpoint != c.output { + t.Errorf("expected %s, got %s", c.output, mountpoint) + } + } +} + +func TestFindCgroupMountpointAndRoot(t *testing.T) { + fakeMountInfo := ` +35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +` + testCases := []struct { + cgroupPath string + output string + }{ + {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"}, + {cgroupPath: "", output: "/foo"}, + } + + for _, c := range testCases { + mountpoint, _, _ := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices") + if mountpoint != c.output { + t.Errorf("expected %s, got %s", c.output, mountpoint) + } + } +} + +func TestGetHugePageSizeImpl(t *testing.T) { + + testCases := []struct { + inputFiles []string + outputPageSizes []string + err error + }{ + { + inputFiles: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, + outputPageSizes: []string{"1GB", "2MB", "32MB", "64KB"}, + err: nil, + }, + { + inputFiles: []string{}, + outputPageSizes: []string{}, + err: nil, + }, + { + inputFiles: []string{"hugepages-a"}, + outputPageSizes: []string{}, + err: errors.New("invalid size: 'a'"), + }, + } + + for _, c := range testCases { + pageSizes, err := getHugePageSizeFromFilenames(c.inputFiles) + if len(pageSizes) != 0 && len(c.outputPageSizes) != 0 && !reflect.DeepEqual(pageSizes, c.outputPageSizes) { + t.Errorf("expected %s, got %s", c.outputPageSizes, pageSizes) + } + if err != nil && err.Error() != c.err.Error() { + t.Errorf("expected error %s, got %s", c.err, err) + } + } +} diff --git a/libcontainer/configs/blkio_device.go b/libcontainer/configs/blkio_device.go new file mode 100644 index 0000000..fa195bf --- /dev/null +++ b/libcontainer/configs/blkio_device.go @@ -0,0 +1,66 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go new file mode 100644 index 0000000..58ed19c --- /dev/null +++ b/libcontainer/configs/cgroup_linux.go @@ -0,0 +1,130 @@ +package configs + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type Cgroup struct { + // Deprecated, use Path instead + Name string `json:"name,omitempty"` + + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources +} + +type Resources struct { + // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. + // Deprecated + AllowAllDevices *bool `json:"allow_all_devices,omitempty"` + // Deprecated + AllowedDevices []*Device `json:"allowed_devices,omitempty"` + // Deprecated + DeniedDevices []*Device `json:"denied_devices,omitempty"` + + Devices []*Device `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // Kernel memory limit (in bytes) + KernelMemory int64 `json:"kernel_memory"` + + // Kernel memory limit for TCP use (in bytes) + KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // CpuMax sets she maximum bandwidth limit (format: max period). + CpuMax string `json:"cpu_max"` +} diff --git a/libcontainer/configs/cgroup_unsupported.go b/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 0000000..c0c23d7 --- /dev/null +++ b/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct { +} diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go new file mode 100644 index 0000000..24989e9 --- /dev/null +++ b/libcontainer/configs/config.go @@ -0,0 +1,354 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. +type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace + Log +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // Path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj *int `json:"oom_score_adj,omitempty"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. + Seccomp *Seccomp `json:"seccomp"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks *Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` + + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` +} + +type Hooks struct { + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + Prestart []Hook + + // Poststart commands are executed after the container init process starts. + Poststart []Hook + + // Poststop commands are executed after the container init process exits. + Poststop []Hook +} + +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. + Ambient []string +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state struct { + Prestart []CommandHook + Poststart []CommandHook + Poststop []CommandHook + } + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + deserialize := func(shooks []CommandHook) (hooks []Hook) { + for _, shook := range shooks { + hooks = append(hooks, shook) + } + + return hooks + } + + hooks.Prestart = deserialize(state.Prestart) + hooks.Poststart = deserialize(state.Poststart) + hooks.Poststop = deserialize(state.Poststop) + return nil +} + +func (hooks Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize(hooks.Prestart), + "poststart": serialize(hooks.Poststart), + "poststop": serialize(hooks.Poststop), + }) +} + +type Hook interface { + // Run executes the hook with the provided state. + Run(*specs.State) error +} + +// NewFunctionHook will call the provided function when the hook is run. +func NewFunctionHook(f func(*specs.State) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(*specs.State) error +} + +func (f FuncHook) Run(s *specs.State) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s *specs.State) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + errC := make(chan error, 1) + go func() { + err := cmd.Wait() + if err != nil { + err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + } + errC <- err + }() + var timerCh <-chan time.Time + if c.Timeout != nil { + timer := time.NewTimer(*c.Timeout) + defer timer.Stop() + timerCh = timer.C + } + select { + case err := <-errC: + return err + case <-timerCh: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } +} diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go new file mode 100644 index 0000000..07da108 --- /dev/null +++ b/libcontainer/configs/config_linux.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/libcontainer/configs/config_linux_test.go b/libcontainer/configs/config_linux_test.go new file mode 100644 index 0000000..9c5f0fe --- /dev/null +++ b/libcontainer/configs/config_linux_test.go @@ -0,0 +1,130 @@ +package configs + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "testing" +) + +func loadConfig(name string) (*Config, error) { + f, err := os.Open(filepath.Join("../sample_configs", name)) + if err != nil { + return nil, err + } + defer f.Close() + + var container *Config + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + + // Check that a config doesn't contain extra fields + var configMap, abstractMap map[string]interface{} + + if _, err := f.Seek(0, 0); err != nil { + return nil, err + } + + if err := json.NewDecoder(f).Decode(&abstractMap); err != nil { + return nil, err + } + + configData, err := json.Marshal(&container) + if err != nil { + return nil, err + } + + if err := json.Unmarshal(configData, &configMap); err != nil { + return nil, err + } + + for k := range configMap { + delete(abstractMap, k) + } + + if len(abstractMap) != 0 { + return nil, fmt.Errorf("unknown fields: %s", abstractMap) + } + + return container, nil +} + +func TestRemoveNamespace(t *testing.T) { + ns := Namespaces{ + {Type: NEWNET}, + } + if !ns.Remove(NEWNET) { + t.Fatal("NEWNET was not removed") + } + if len(ns) != 0 { + t.Fatalf("namespaces should have 0 items but reports %d", len(ns)) + } +} + +func TestHostRootUIDNoUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{}, + } + uid, err := config.HostRootUID() + if err != nil { + t.Fatal(err) + } + if uid != 0 { + t.Fatalf("expected uid 0 with no USERNS but received %d", uid) + } +} + +func TestHostRootUIDWithUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{{Type: NEWUSER}}, + UidMappings: []IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 1, + }, + }, + } + uid, err := config.HostRootUID() + if err != nil { + t.Fatal(err) + } + if uid != 1000 { + t.Fatalf("expected uid 1000 with no USERNS but received %d", uid) + } +} + +func TestHostRootGIDNoUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{}, + } + uid, err := config.HostRootGID() + if err != nil { + t.Fatal(err) + } + if uid != 0 { + t.Fatalf("expected gid 0 with no USERNS but received %d", uid) + } +} + +func TestHostRootGIDWithUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{{Type: NEWUSER}}, + GidMappings: []IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 1, + }, + }, + } + uid, err := config.HostRootGID() + if err != nil { + t.Fatal(err) + } + if uid != 1000 { + t.Fatalf("expected gid 1000 with no USERNS but received %d", uid) + } +} diff --git a/libcontainer/configs/config_test.go b/libcontainer/configs/config_test.go new file mode 100644 index 0000000..c89a764 --- /dev/null +++ b/libcontainer/configs/config_test.go @@ -0,0 +1,195 @@ +package configs_test + +import ( + "encoding/json" + "fmt" + "os" + "reflect" + "testing" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestUnmarshalHooks(t *testing.T) { + timeout := time.Second + + prestartCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/prestart", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + prestart, err := json.Marshal(prestartCmd.Command) + if err != nil { + t.Fatal(err) + } + + hook := configs.Hooks{} + err = hook.UnmarshalJSON([]byte(fmt.Sprintf(`{"Prestart" :[%s]}`, prestart))) + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(hook.Prestart[0], prestartCmd) { + t.Errorf("Expected prestart to equal %+v but it was %+v", + prestartCmd, hook.Prestart[0]) + } +} + +func TestUnmarshalHooksWithInvalidData(t *testing.T) { + hook := configs.Hooks{} + err := hook.UnmarshalJSON([]byte(`{invalid-json}`)) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestMarshalHooks(t *testing.T) { + timeout := time.Second + + prestartCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/prestart", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + + hook := configs.Hooks{ + Prestart: []configs.Hook{prestartCmd}, + } + hooks, err := hook.MarshalJSON() + if err != nil { + t.Fatal(err) + } + + h := `{"poststart":null,"poststop":null,"prestart":[{"path":"/var/vcap/hooks/prestart","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]}` + if string(hooks) != h { + t.Errorf("Expected hooks %s to equal %s", string(hooks), h) + } +} + +func TestMarshalUnmarshalHooks(t *testing.T) { + timeout := time.Second + + prestart := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/prestart", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + + hook := configs.Hooks{ + Prestart: []configs.Hook{prestart}, + } + hooks, err := hook.MarshalJSON() + if err != nil { + t.Fatal(err) + } + + umMhook := configs.Hooks{} + err = umMhook.UnmarshalJSON(hooks) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(umMhook.Prestart[0], prestart) { + t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", umMhook.Prestart[0], prestart) + } +} + +func TestMarshalHooksWithUnexpectedType(t *testing.T) { + fHook := configs.NewFunctionHook(func(*specs.State) error { + return nil + }) + hook := configs.Hooks{ + Prestart: []configs.Hook{fHook}, + } + hooks, err := hook.MarshalJSON() + if err != nil { + t.Fatal(err) + } + + h := `{"poststart":null,"poststop":null,"prestart":null}` + if string(hooks) != h { + t.Errorf("Expected hooks %s to equal %s", string(hooks), h) + } +} + +func TestFuncHookRun(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + + fHook := configs.NewFunctionHook(func(s *specs.State) error { + if !reflect.DeepEqual(state, s) { + t.Errorf("Expected state %+v to equal %+v", state, s) + } + return nil + }) + + fHook.Run(state) +} + +func TestCommandHookRun(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + timeout := time.Second + + cmdHook := configs.NewCommandHook(configs.Command{ + Path: os.Args[0], + Args: []string{os.Args[0], "-test.run=TestHelperProcess"}, + Env: []string{"FOO=BAR"}, + Dir: "/", + Timeout: &timeout, + }) + + err := cmdHook.Run(state) + if err != nil { + t.Errorf(fmt.Sprintf("Expected error to not occur but it was %+v", err)) + } +} + +func TestCommandHookRunTimeout(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + timeout := (10 * time.Millisecond) + + cmdHook := configs.NewCommandHook(configs.Command{ + Path: os.Args[0], + Args: []string{os.Args[0], "-test.run=TestHelperProcessWithTimeout"}, + Env: []string{"FOO=BAR"}, + Dir: "/", + Timeout: &timeout, + }) + + err := cmdHook.Run(state) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestHelperProcess(*testing.T) { + fmt.Println("Helper Process") + os.Exit(0) +} +func TestHelperProcessWithTimeout(*testing.T) { + time.Sleep(time.Second) +} diff --git a/libcontainer/configs/config_windows_test.go b/libcontainer/configs/config_windows_test.go new file mode 100644 index 0000000..1a0c8fa --- /dev/null +++ b/libcontainer/configs/config_windows_test.go @@ -0,0 +1,3 @@ +package configs + +// All current tests are for Unix-specific functionality diff --git a/libcontainer/configs/device.go b/libcontainer/configs/device.go new file mode 100644 index 0000000..8701bb2 --- /dev/null +++ b/libcontainer/configs/device.go @@ -0,0 +1,57 @@ +package configs + +import ( + "fmt" + "os" +) + +const ( + Wildcard = -1 +) + +// TODO Windows: This can be factored out in the future + +type Device struct { + // Device type, block, char, etc. + Type rune `json:"type"` + + // Path to the device. + Path string `json:"path"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Cgroup permissions format, rwm. + Permissions string `json:"permissions"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` + + // Write the file to the allowed list + Allow bool `json:"allow"` +} + +func (d *Device) CgroupString() string { + return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) +} + +func (d *Device) Mkdev() int { + return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) +} + +// deviceNumberString converts the device number to a string return result. +func deviceNumberString(number int64) string { + if number == Wildcard { + return "*" + } + return fmt.Sprint(number) +} diff --git a/libcontainer/configs/device_defaults.go b/libcontainer/configs/device_defaults.go new file mode 100644 index 0000000..e4f423c --- /dev/null +++ b/libcontainer/configs/device_defaults.go @@ -0,0 +1,111 @@ +// +build linux + +package configs + +var ( + // DefaultSimpleDevices are devices that are to be both allowed and created. + DefaultSimpleDevices = []*Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + Major: 1, + Minor: 3, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + Major: 1, + Minor: 5, + Permissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + Major: 1, + Minor: 7, + Permissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + Major: 5, + Minor: 0, + Permissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + Major: 1, + Minor: 9, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + Major: 1, + Minor: 8, + Permissions: "rwm", + FileMode: 0666, + }, + } + DefaultAllowedDevices = append([]*Device{ + // allow mknod for any device + { + Type: 'c', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + { + Type: 'b', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: Wildcard, + Permissions: "rwm", + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + }, + }, DefaultSimpleDevices...) + DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...) +) diff --git a/libcontainer/configs/hugepage_limit.go b/libcontainer/configs/hugepage_limit.go new file mode 100644 index 0000000..d302163 --- /dev/null +++ b/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/libcontainer/configs/intelrdt.go b/libcontainer/configs/intelrdt.go new file mode 100644 index 0000000..57e9f03 --- /dev/null +++ b/libcontainer/configs/intelrdt.go @@ -0,0 +1,13 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` +} diff --git a/libcontainer/configs/interface_priority_map.go b/libcontainer/configs/interface_priority_map.go new file mode 100644 index 0000000..9a0395e --- /dev/null +++ b/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go new file mode 100644 index 0000000..670757d --- /dev/null +++ b/libcontainer/configs/mount.go @@ -0,0 +1,39 @@ +package configs + +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Optional Command to be run before Source is mounted. + PremountCmds []Command `json:"premount_cmds"` + + // Optional Command to be run after Source is mounted. + PostmountCmds []Command `json:"postmount_cmds"` +} diff --git a/libcontainer/configs/namespaces.go b/libcontainer/configs/namespaces.go new file mode 100644 index 0000000..a3329a3 --- /dev/null +++ b/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go new file mode 100644 index 0000000..1bbaef9 --- /dev/null +++ b/libcontainer/configs/namespaces_linux.go @@ -0,0 +1,126 @@ +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + case NEWCGROUP: + return "cgroup" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := NsName(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWUSER, // Keep user NS always first, don't move it. + NEWIPC, + NEWUTS, + NEWNET, + NEWPID, + NEWNS, + NEWCGROUP, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/libcontainer/configs/namespaces_syscall.go b/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 0000000..2dc7adf --- /dev/null +++ b/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,32 @@ +// +build linux + +package configs + +import "golang.org/x/sys/unix" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/libcontainer/configs/namespaces_syscall_unsupported.go b/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 0000000..5d9a5c8 --- /dev/null +++ b/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") +} diff --git a/libcontainer/configs/namespaces_unsupported.go b/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 0000000..19bf713 --- /dev/null +++ b/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { +} diff --git a/libcontainer/configs/network.go b/libcontainer/configs/network.go new file mode 100644 index 0000000..ccdb228 --- /dev/null +++ b/libcontainer/configs/network.go @@ -0,0 +1,72 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. + HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name"` +} diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go new file mode 100644 index 0000000..393d9e8 --- /dev/null +++ b/libcontainer/configs/validate/rootless.go @@ -0,0 +1,89 @@ +package validate + +import ( + "fmt" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +// rootlessEUID makes sure that the config can be applied when runc +// is being executed as a non-root user (euid != 0) in the current user namespace. +func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { + if err := rootlessEUIDMappings(config); err != nil { + return err + } + if err := rootlessEUIDMount(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func hasIDMapping(id int, mappings []configs.IDMap) bool { + for _, m := range mappings { + if id >= m.ContainerID && id < m.ContainerID+m.Size { + return true + } + } + return false +} + +func rootlessEUIDMappings(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless container requires user namespaces") + } + + if len(config.UidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one GID mapping") + } + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessEUIDMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") { + var uid int + n, err := fmt.Sscanf(opt, "uid=%d", &uid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(uid, config.UidMappings) { + return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers") + } + } + + if strings.HasPrefix(opt, "gid=") { + var gid int + n, err := fmt.Sscanf(opt, "gid=%d", &gid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(gid, config.GidMappings) { + return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers") + } + } + } + } + + return nil +} diff --git a/libcontainer/configs/validate/rootless_test.go b/libcontainer/configs/validate/rootless_test.go new file mode 100644 index 0000000..59d1557 --- /dev/null +++ b/libcontainer/configs/validate/rootless_test.go @@ -0,0 +1,155 @@ +package validate + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func rootlessEUIDConfig() *configs.Config { + return &configs.Config{ + Rootfs: "/var", + RootlessEUID: true, + RootlessCgroups: true, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + UidMappings: []configs.IDMap{ + { + HostID: 1337, + ContainerID: 0, + Size: 1, + }, + }, + GidMappings: []configs.IDMap{ + { + HostID: 7331, + ContainerID: 0, + Size: 1, + }, + }, + } +} + +func TestValidateRootlessEUID(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +/* rootlessEUIDMappings */ + +func TestValidateRootlessEUIDUserns(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.Namespaces = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if user namespaces not set") + } +} + +func TestValidateRootlessEUIDMappingUid(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.UidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no uid mappings provided") + } +} + +func TestValidateNonZeroEUIDMappingGid(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.GidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no gid mappings provided") + } +} + +/* rootlessEUIDMount() */ + +func TestValidateRootlessEUIDMountUid(t *testing.T) { + config := rootlessEUIDConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "uid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=5 in mount options") + } + + config.Mounts[0].Data = "uid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err) + } + + config.Mounts[0].Data = "uid=2" + config.UidMappings[0].Size = 10 + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=2 in mount options and UidMapping[0].size is 10") + } + + config.Mounts[0].Data = "uid=20" + config.UidMappings[0].Size = 10 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=20 in mount options and UidMapping[0].size is 10") + } +} + +func TestValidateRootlessEUIDMountGid(t *testing.T) { + config := rootlessEUIDConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=5 in mount options") + } + + config.Mounts[0].Data = "gid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + config.GidMappings[0].Size = 10 + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=5 in mount options and GidMapping[0].size is 10") + } + + config.Mounts[0].Data = "gid=11" + config.GidMappings[0].Size = 10 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=11 in mount options and GidMapping[0].size is 10") + } +} diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go new file mode 100644 index 0000000..3b42f30 --- /dev/null +++ b/libcontainer/configs/validate/validator.go @@ -0,0 +1,245 @@ +package validate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + selinux "github.com/opencontainers/selinux/go-selinux" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.cgroupnamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + if err := v.intelrdt(config); err != nil { + return err + } + if config.RootlessEUID { + if err := v.rootlessEUID(config); err != nil { + return err + } + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.GetEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return fmt.Errorf("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWCGROUP) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + return fmt.Errorf("cgroup namespaces aren't enabled in the kernel") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { + if err := checkHostNs(s, path); err != nil { + return err + } + } + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + if config.Namespaces.Contains(configs.NEWUTS) { + switch s { + case "kernel.domainname": + // This is namespaced and there's no explicit OCI field for it. + continue + case "kernel.hostname": + // This is namespaced but there's a conflicting (dedicated) OCI field for it. + return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname") + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} + +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled") + } + + if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" { + return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") + } + if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" { + return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") + } + + if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" { + return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") + } + } + + return nil +} + +func isSymbolicLink(path string) (bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil +} + +// checkHostNs checks whether network sysctl is used in host namespace. +func checkHostNs(sysctlConfig string, path string) error { + var currentProcessNetns = "/proc/self/ns/net" + // readlink on the current processes network namespace + destOfCurrentProcess, err := os.Readlink(currentProcessNetns) + if err != nil { + return fmt.Errorf("read soft link %q error", currentProcessNetns) + } + + // First check if the provided path is a symbolic link + symLink, err := isSymbolicLink(path) + if err != nil { + return fmt.Errorf("could not check that %q is a symlink: %v", path, err) + } + + if symLink == false { + // The provided namespace is not a symbolic link, + // it is not the host namespace. + return nil + } + + // readlink on the path provided in the struct + destOfContainer, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("read soft link %q error", path) + } + if destOfContainer == destOfCurrentProcess { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + } + return nil +} diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go new file mode 100644 index 0000000..f6826fb --- /dev/null +++ b/libcontainer/configs/validate/validator_test.go @@ -0,0 +1,267 @@ +package validate_test + +import ( + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" +) + +func TestValidate(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateWithInvalidRootfs(t *testing.T) { + dir := "rootfs" + os.Symlink("/var", dir) + defer os.Remove(dir) + + config := &configs.Config{ + Rootfs: dir, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateNetworkWithoutNETNamespace(t *testing.T) { + network := &configs.Network{Type: "loopback"} + config := &configs.Config{ + Rootfs: "/var", + Namespaces: []configs.Namespace{}, + Networks: []*configs.Network{network}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateNetworkRoutesWithoutNETNamespace(t *testing.T) { + route := &configs.Route{Gateway: "255.255.255.0"} + config := &configs.Config{ + Rootfs: "/var", + Namespaces: []configs.Namespace{}, + Routes: []*configs.Route{route}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateHostname(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Hostname: "runc", + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUTS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateHostnameWithoutUTSNamespace(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Hostname: "runc", + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSecurityWithMaskPaths(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + MaskPaths: []string{"/proc/kcore"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWNS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateSecurityWithROPaths(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + ReadonlyPaths: []string{"/proc/sys"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWNS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateSecurityWithoutNEWNS(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + MaskPaths: []string{"/proc/kcore"}, + ReadonlyPaths: []string{"/proc/sys"}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateUsernamespace(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + config := &configs.Config{ + Rootfs: "/var", + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("expected error to not occur %+v", err) + } +} + +func TestValidateUsernamespaceWithoutUserNS(t *testing.T) { + uidMap := configs.IDMap{ContainerID: 123} + config := &configs.Config{ + Rootfs: "/var", + UidMappings: []configs.IDMap{uidMap}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSysctl(t *testing.T) { + sysctl := map[string]string{ + "fs.mqueue.ctl": "ctl", + "net.ctl": "ctl", + "kernel.ctl": "ctl", + } + + for k, v := range sysctl { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{k: v}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } + } +} + +func TestValidateValidSysctl(t *testing.T) { + sysctl := map[string]string{ + "fs.mqueue.ctl": "ctl", + "net.ctl": "ctl", + "kernel.msgmax": "ctl", + } + + for k, v := range sysctl { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{k: v}, + Namespaces: []configs.Namespace{ + { + Type: configs.NEWNET, + }, + { + Type: configs.NEWIPC, + }, + }, + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err) + } + } +} + +func TestValidateSysctlWithSameNs(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + { + Type: configs.NEWNET, + Path: "/proc/self/ns/net", + }, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSysctlWithoutNETNamespace(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl"}, + Namespaces: []configs.Namespace{}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} diff --git a/libcontainer/console_linux.go b/libcontainer/console_linux.go new file mode 100644 index 0000000..9997e93 --- /dev/null +++ b/libcontainer/console_linux.go @@ -0,0 +1,41 @@ +package libcontainer + +import ( + "os" + + "golang.org/x/sys/unix" +) + +// mount initializes the console inside the rootfs mounting with the specified mount label +// and applying the correct ownership of the console. +func mountConsole(slavePath string) error { + oldMask := unix.Umask(0000) + defer unix.Umask(oldMask) + f, err := os.Create("/dev/console") + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "") +} + +// dupStdio opens the slavePath for the console and dups the fds to the current +// processes stdio, fd 0,1,2. +func dupStdio(slavePath string) error { + fd, err := unix.Open(slavePath, unix.O_RDWR, 0) + if err != nil { + return &os.PathError{ + Op: "open", + Path: slavePath, + Err: err, + } + } + for _, i := range []int{0, 1, 2} { + if err := unix.Dup3(fd, i, 0); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/container.go b/libcontainer/container.go new file mode 100644 index 0000000..ba7541c --- /dev/null +++ b/libcontainer/container.go @@ -0,0 +1,173 @@ +// Package libcontainer provides a native Go implementation for creating containers +// with namespaces, cgroups, capabilities, and filesystem access controls. +// It allows you to manage the lifecycle of the container performing additional operations +// after the container is created. +package libcontainer + +import ( + "os" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Status is the status of a container. +type Status int + +const ( + // Created is the status that denotes the container exists but has not been run yet. + Created Status = iota + // Running is the status that denotes the container exists and is running. + Running + // Pausing is the status that denotes the container exists, it is in the process of being paused. + Pausing + // Paused is the status that denotes the container exists, but all its processes are paused. + Paused + // Stopped is the status that denotes the container does not have a created or running process. + Stopped +) + +func (s Status) String() string { + switch s { + case Created: + return "created" + case Running: + return "running" + case Pausing: + return "pausing" + case Paused: + return "paused" + case Stopped: + return "stopped" + default: + return "unknown" + } +} + +// BaseState represents the platform agnostic pieces relating to a +// running container's state +type BaseState struct { + // ID is the container ID. + ID string `json:"id"` + + // InitProcessPid is the init process id in the parent namespace. + InitProcessPid int `json:"init_process_pid"` + + // InitProcessStartTime is the init process start time in clock cycles since boot time. + InitProcessStartTime uint64 `json:"init_process_start"` + + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + + // Config is the container's configuration. + Config configs.Config `json:"config"` +} + +// BaseContainer is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. BaseContainer includes methods that are platform agnostic. +type BaseContainer interface { + // Returns the ID of the container + ID() string + + // Returns the current status of the container. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Status() (Status, error) + + // State returns the current container's state information. + // + // errors: + // SystemError - System error. + State() (*State, error) + + // OCIState returns the current container's state information. + // + // errors: + // SystemError - System error. + OCIState() (*specs.State, error) + + // Returns the current config of the container. + Config() configs.Config + + // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + // + // Some of the returned PIDs may no longer refer to processes in the Container, unless + // the Container state is PAUSED in which case every PID in the slice is valid. + Processes() ([]int, error) + + // Returns statistics for the container. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Stats() (*Stats, error) + + // Set resources of container as configured + // + // We can use this to change resources when containers are running. + // + // errors: + // SystemError - System error. + Set(config configs.Config) error + + // Start a process inside the container. Returns error if process fails to + // start. You can track process lifecycle with passed Process structure. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Start(process *Process) (err error) + + // Run immediately starts the process inside the container. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + + // Destroys the container, if its in a valid state, after killing any + // remaining running processes. + // + // Any event registrations are removed before the container is destroyed. + // No error is returned if the container is already destroyed. + // + // Running containers must first be stopped using Signal(..). + // Paused containers must first be resumed using Resume(..). + // + // errors: + // ContainerNotStopped - Container is still running, + // ContainerPaused - Container is paused, + // SystemError - System error. + Destroy() error + + // Signal sends the provided signal code to the container's initial process. + // + // If all is specified the signal is sent to all processes in the container + // including the initial process. + // + // errors: + // SystemError - System error. + Signal(s os.Signal, all bool) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error +} diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go new file mode 100644 index 0000000..fe70c93 --- /dev/null +++ b/libcontainer/container_linux.go @@ -0,0 +1,2060 @@ +// +build linux + +package libcontainer + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "os/exec" + "path/filepath" + "reflect" + "strings" + "sync" + "syscall" // only for SysProcAttr and Signal + "time" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + criurpc "github.com/checkpoint-restore/go-criu/rpc" + "github.com/golang/protobuf/proto" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +const stdioFdCount = 3 + +type linuxContainer struct { + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + intelRdtManager intelrdt.Manager + initPath string + initArgs []string + initProcess parentProcess + initProcessStartTime uint64 + criuPath string + newuidmapPath string + newgidmapPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time +} + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here + + // Specified if the container was started under the rootless mode. + // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups + Rootless bool `json:"rootless"` + + // Path to all the cgroups setup for a container. Key is cgroup subsystem name + // with the value as the path. + CgroupPaths map[string]string `json:"cgroup_paths"` + + // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type + // with the value as the path. + NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` + + // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore + ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` +} + +// Container is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + BaseContainer + + // Methods below here are platform specific + + // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Checkpoint(criuOpts *CriuOpts) error + + // Restore restores the checkpointed container to a running state using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Restore(process *Process, criuOpts *CriuOpts) error + + // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses + // the execution of any user processes. Asynchronously, when the container finished being paused the + // state is changed to PAUSED. + // If the Container state is PAUSED, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotRunning - Container not running or created, + // Systemerror - System error. + Pause() error + + // If the Container state is PAUSED, resumes the execution of any user processes in the + // Container before setting the Container state to RUNNING. + // If the Container state is RUNNING, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotPaused - Container is not paused, + // Systemerror - System error. + Resume() error + + // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. + // + // errors: + // Systemerror - System error. + NotifyOOM() (<-chan struct{}, error) + + // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level + // + // errors: + // Systemerror - System error. + NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) +} + +// ID returns the container's unique ID +func (c *linuxContainer) ID() string { + return c.id +} + +// Config returns the container's configuration +func (c *linuxContainer) Config() configs.Config { + return *c.config +} + +func (c *linuxContainer) Status() (Status, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentStatus() +} + +func (c *linuxContainer) State() (*State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentState() +} + +func (c *linuxContainer) OCIState() (*specs.State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentOCIState() +} + +func (c *linuxContainer) Processes() ([]int, error) { + pids, err := c.cgroupManager.GetAllPids() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") + } + return pids, nil +} + +func (c *linuxContainer) Stats() (*Stats, error) { + var ( + err error + stats = &Stats{} + ) + if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") + } + if c.intelRdtManager != nil { + if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") + } + } + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) + if err != nil { + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) + } + stats.Interfaces = append(stats.Interfaces, istats) + } + } + return stats, nil +} + +func (c *linuxContainer) Set(config configs.Config) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } + if err := c.cgroupManager.Set(&config); err != nil { + // Set configs back + if err2 := c.cgroupManager.Set(c.config); err2 != nil { + logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } + return err + } + if c.intelRdtManager != nil { + if err := c.intelRdtManager.Set(&config); err != nil { + // Set configs back + if err2 := c.intelRdtManager.Set(c.config); err2 != nil { + logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } + return err + } + } + // After config setting succeed, update config and states + c.config = &config + _, err = c.updateState(nil) + return err +} + +func (c *linuxContainer) Start(process *Process) error { + c.m.Lock() + defer c.m.Unlock() + if process.Init { + if err := c.createExecFifo(); err != nil { + return err + } + } + if err := c.start(process); err != nil { + if process.Init { + c.deleteExecFifo() + } + return err + } + return nil +} + +func (c *linuxContainer) Run(process *Process) error { + if err := c.Start(process); err != nil { + return err + } + if process.Init { + return c.exec() + } + return nil +} + +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + pid := c.initProcess.pid() + blockingFifoOpenCh := awaitFifoOpen(path) + for { + select { + case result := <-blockingFifoOpenCh: + return handleFifoResult(result) + + case <-time.After(time.Millisecond * 100): + stat, err := system.Stat(pid) + if err != nil || stat.State == system.Zombie { + // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check. + // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete). + if err := handleFifoResult(fifoOpen(path, false)); err != nil { + return errors.New("container process is already dead") + } + return nil + } + } + } +} + +func readFromExecFifo(execFifo io.Reader) error { + data, err := ioutil.ReadAll(execFifo) + if err != nil { + return err + } + if len(data) <= 0 { + return fmt.Errorf("cannot start an already running container") + } + return nil +} + +func awaitFifoOpen(path string) <-chan openResult { + fifoOpened := make(chan openResult) + go func() { + result := fifoOpen(path, true) + fifoOpened <- result + }() + return fifoOpened +} + +func fifoOpen(path string, block bool) openResult { + flags := os.O_RDONLY + if !block { + flags |= syscall.O_NONBLOCK + } + f, err := os.OpenFile(path, flags, 0) + if err != nil { + return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} + } + return openResult{file: f} +} + +func handleFifoResult(result openResult) error { + if result.err != nil { + return result.err + } + f := result.file + defer f.Close() + if err := readFromExecFifo(f); err != nil { + return err + } + return os.Remove(f.Name()) +} + +type openResult struct { + file *os.File + err error +} + +func (c *linuxContainer) start(process *Process) error { + parent, err := c.newParentProcess(process) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") + } + parent.forwardChildLogs() + if err := parent.start(); err != nil { + // terminate the process to ensure that it properly is reaped. + if err := ignoreTerminateErrors(parent.terminate()); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCause(err, "starting container process") + } + // generate a timestamp indicating when the container was started + c.created = time.Now().UTC() + if process.Init { + c.state = &createdState{ + c: c, + } + state, err := c.updateState(parent) + if err != nil { + return err + } + c.initProcessStartTime = state.InitProcessStartTime + + if c.config.Hooks != nil { + s, err := c.currentOCIState() + if err != nil { + return err + } + for i, hook := range c.config.Hooks.Poststart { + if err := hook.Run(s); err != nil { + if err := ignoreTerminateErrors(parent.terminate()); err != nil { + logrus.Warn(err) + } + return newSystemErrorWithCausef(err, "running poststart hook %d", i) + } + } + } + } + return nil +} + +func (c *linuxContainer) Signal(s os.Signal, all bool) error { + if all { + return signalAllProcesses(c.cgroupManager, s) + } + status, err := c.currentStatus() + if err != nil { + return err + } + // to avoid a PID reuse attack + if status == Running || status == Created || status == Paused { + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") + } + return nil + } + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) +} + +func (c *linuxContainer) createExecFifo() error { + rootuid, err := c.Config().HostRootUID() + if err != nil { + return err + } + rootgid, err := c.Config().HostRootGID() + if err != nil { + return err + } + + fifoName := filepath.Join(c.root, execFifoFilename) + if _, err := os.Stat(fifoName); err == nil { + return fmt.Errorf("exec fifo %s already exists", fifoName) + } + oldMask := unix.Umask(0000) + if err := unix.Mkfifo(fifoName, 0622); err != nil { + unix.Umask(oldMask) + return err + } + unix.Umask(oldMask) + return os.Chown(fifoName, rootuid, rootgid) +} + +func (c *linuxContainer) deleteExecFifo() { + fifoName := filepath.Join(c.root, execFifoFilename) + os.Remove(fifoName) +} + +// includeExecFifo opens the container's execfifo as a pathfd, so that the +// container cannot access the statedir (and the FIFO itself remains +// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited +// fd, with _LIBCONTAINER_FIFOFD set to its fd number. +func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { + fifoName := filepath.Join(c.root, execFifoFilename) + fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return err + } + + cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + return nil +} + +func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + parentInitPipe, childInitPipe, err := utils.NewSockPair("init") + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new init pipe") + } + messageSockPair := filePair{parentInitPipe, childInitPipe} + + parentLogPipe, childLogPipe, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("Unable to create the log pipe: %s", err) + } + logFilePair := filePair{parentLogPipe, childLogPipe} + + cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe) + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new command template") + } + if !p.Init { + return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) + } + + // We only set up fifoFd if we're not doing a `runc exec`. The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") + } + return c.newInitProcess(p, cmd, messageSockPair, logFilePair) +} + +func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) (*exec.Cmd, error) { + cmd := exec.Command(c.initPath, c.initArgs[1:]...) + cmd.Args[0] = c.initArgs[0] + cmd.Stdin = p.Stdin + cmd.Stdout = p.Stdout + cmd.Stderr = p.Stderr + cmd.Dir = c.config.Rootfs + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))) + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) + if p.ConsoleSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), + ) + + cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + fmt.Sprintf("_LIBCONTAINER_LOGLEVEL=%s", p.LogLevel), + ) + + // NOTE: when running a container with no PID namespace and the parent process spawning the container is + // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason + // even with the parent still running. + if c.config.ParentDeathSignal > 0 { + cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) + } + return cmd, nil +} + +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) + if err != nil { + return nil, err + } + init := &initProcess{ + cmd: cmd, + messageSockPair: messageSockPair, + logFilePair: logFilePair, + manager: c.cgroupManager, + intelRdtManager: c.intelRdtManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, + } + c.initProcess = init + return init, nil +} + +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting container's current state") + } + // for setns process, we don't have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(0, state.NamespacePaths) + if err != nil { + return nil, err + } + return &setnsProcess{ + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + rootlessCgroups: c.config.RootlessCgroups, + intelRdtPath: state.IntelRdtPath, + messageSockPair: messageSockPair, + logFilePair: logFilePair, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, + }, nil +} + +func (c *linuxContainer) newInitConfig(process *Process) *initConfig { + cfg := &initConfig{ + Config: c.config, + Args: process.Args, + Env: process.Env, + User: process.User, + AdditionalGroups: process.AdditionalGroups, + Cwd: process.Cwd, + Capabilities: process.Capabilities, + PassedFilesCount: len(process.ExtraFiles), + ContainerId: c.ID(), + NoNewPrivileges: c.config.NoNewPrivileges, + RootlessEUID: c.config.RootlessEUID, + RootlessCgroups: c.config.RootlessCgroups, + AppArmorProfile: c.config.AppArmorProfile, + ProcessLabel: c.config.ProcessLabel, + Rlimits: c.config.Rlimits, + } + if process.NoNewPrivileges != nil { + cfg.NoNewPrivileges = *process.NoNewPrivileges + } + if process.AppArmorProfile != "" { + cfg.AppArmorProfile = process.AppArmorProfile + } + if process.Label != "" { + cfg.ProcessLabel = process.Label + } + if len(process.Rlimits) > 0 { + cfg.Rlimits = process.Rlimits + } + cfg.CreateConsole = process.ConsoleSocket != nil + cfg.ConsoleWidth = process.ConsoleWidth + cfg.ConsoleHeight = process.ConsoleHeight + return cfg +} + +func (c *linuxContainer) Destroy() error { + c.m.Lock() + defer c.m.Unlock() + return c.state.destroy() +} + +func (c *linuxContainer) Pause() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) + } + return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) +} + +func (c *linuxContainer) Resume() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Paused { + return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + } + if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return c.state.transition(&runningState{ + c: c, + }) +} + +func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.RootlessCgroups { + logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") + } + return notifyOnOOM(c.cgroupManager.GetPaths()) +} + +func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.RootlessCgroups { + logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") + } + return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) +} + +var criuFeatures *criurpc.CriuFeatures + +func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { + + var t criurpc.CriuReqType + t = criurpc.CriuReqType_FEATURE_CHECK + + // criu 1.8 => 10800 + if err := c.checkCriuVersion(10800); err != nil { + // Feature checking was introduced with CRIU 1.8. + // Ignore the feature check if an older CRIU version is used + // and just act as before. + // As all automated PR testing is done using CRIU 1.7 this + // code will not be tested by automated PR testing. + return nil + } + + // make sure the features we are looking for are really not from + // some previous check + criuFeatures = nil + + req := &criurpc.CriuReq{ + Type: &t, + // Theoretically this should not be necessary but CRIU + // segfaults if Opts is empty. + // Fixed in CRIU 2.12 + Opts: rpcOpts, + Features: criuFeat, + } + + err := c.criuSwrk(nil, req, criuOpts, false, nil) + if err != nil { + logrus.Debugf("%s", err) + return fmt.Errorf("CRIU feature check failed") + } + + logrus.Debugf("Feature check says: %s", criuFeatures) + missingFeatures := false + + // The outer if checks if the fields actually exist + if (criuFeat.MemTrack != nil) && + (criuFeatures.MemTrack != nil) { + // The inner if checks if they are set to true + if *criuFeat.MemTrack && !*criuFeatures.MemTrack { + missingFeatures = true + logrus.Debugf("CRIU does not support MemTrack") + } + } + + // This needs to be repeated for every new feature check. + // Is there a way to put this in a function. Reflection? + if (criuFeat.LazyPages != nil) && + (criuFeatures.LazyPages != nil) { + if *criuFeat.LazyPages && !*criuFeatures.LazyPages { + missingFeatures = true + logrus.Debugf("CRIU does not support LazyPages") + } + } + + if missingFeatures { + return fmt.Errorf("CRIU is missing features") + } + + return nil +} + +func parseCriuVersion(path string) (int, error) { + var x, y, z int + + out, err := exec.Command(path, "-V").Output() + if err != nil { + return 0, fmt.Errorf("Unable to execute CRIU command: %s", path) + } + + x = 0 + y = 0 + z = 0 + if ep := strings.Index(string(out), "-"); ep >= 0 { + // criu Git version format + var version string + if sp := strings.Index(string(out), "GitID"); sp > 0 { + version = string(out)[sp:ep] + } else { + return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path) + } + + n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6 + y++ + } else { + z++ + } + if n < 2 || err != nil { + return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) + } + } else { + // criu release version format + n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 + if err != nil { + n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 + } + if n < 2 || err != nil { + return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) + } + } + + return x*10000 + y*100 + z, nil +} + +func compareCriuVersion(criuVersion int, minVersion int) error { + // simple function to perform the actual version compare + if criuVersion < minVersion { + return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion) + } + + return nil +} + +// This is used to store the result of criu version RPC +var criuVersionRPC *criurpc.CriuVersion + +// checkCriuVersion checks Criu version greater than or equal to minVersion +func (c *linuxContainer) checkCriuVersion(minVersion int) error { + + // If the version of criu has already been determined there is no need + // to ask criu for the version again. Use the value from c.criuVersion. + if c.criuVersion != 0 { + return compareCriuVersion(c.criuVersion, minVersion) + } + + // First try if this version of CRIU support the version RPC. + // The CRIU version RPC was introduced with CRIU 3.0. + + // First, reset the variable for the RPC answer to nil + criuVersionRPC = nil + + var t criurpc.CriuReqType + t = criurpc.CriuReqType_VERSION + req := &criurpc.CriuReq{ + Type: &t, + } + + err := c.criuSwrk(nil, req, nil, false, nil) + if err != nil { + return fmt.Errorf("CRIU version check failed: %s", err) + } + + if criuVersionRPC != nil { + logrus.Debugf("CRIU version: %s", criuVersionRPC) + // major and minor are always set + c.criuVersion = int(*criuVersionRPC.Major) * 10000 + c.criuVersion += int(*criuVersionRPC.Minor) * 100 + if criuVersionRPC.Sublevel != nil { + c.criuVersion += int(*criuVersionRPC.Sublevel) + } + if criuVersionRPC.Gitid != nil { + // runc's convention is that a CRIU git release is + // always the same as increasing the minor by 1 + c.criuVersion -= (c.criuVersion % 100) + c.criuVersion += 100 + } + return compareCriuVersion(c.criuVersion, minVersion) + } + + // This is CRIU without the version RPC and therefore + // older than 3.0. Parsing the output is required. + + // This can be remove once runc does not work with criu older than 3.0 + + c.criuVersion, err = parseCriuVersion(c.criuPath) + if err != nil { + return err + } + + return compareCriuVersion(c.criuVersion, minVersion) +} + +const descriptorsFilename = "descriptors.json" + +func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(mountDest), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { + for _, path := range c.config.MaskPaths { + fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) + if err != nil { + if os.IsNotExist(err) { + continue + } + return err + } + if fi.IsDir() { + continue + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(path), + Val: proto.String("/dev/null"), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) + } + return nil +} + +func waitForCriuLazyServer(r *os.File, status string) error { + + data := make([]byte, 1) + _, err := r.Read(data) + if err != nil { + return err + } + fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend) + if err != nil { + return err + } + _, err = fd.Write(data) + if err != nil { + return err + } + fd.Close() + + return nil +} + +func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { + // CRIU will evaluate a configuration starting with release 3.11. + // Settings in the configuration file will overwrite RPC settings. + // Look for annotations. The annotation 'org.criu.config' + // specifies if CRIU should use a different, container specific + // configuration file. + _, annotations := utils.Annotations(c.config.Labels) + configFile, exists := annotations["org.criu.config"] + if exists { + // If the annotation 'org.criu.config' exists and is set + // to a non-empty string, tell CRIU to use that as a + // configuration file. If the file does not exist, CRIU + // will just ignore it. + if configFile != "" { + rpcOpts.ConfigFile = proto.String(configFile) + } + // If 'org.criu.config' exists and is set to an empty + // string, a runc specific CRIU configuration file will + // be not set at all. + } else { + // If the mentioned annotation has not been found, specify + // a default CRIU configuration file. + rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") + } +} + +func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) + // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has + // support for doing unprivileged dumps, but the setup of + // rootless containers might make this complicated. + + // criu 1.5.2 => 10502 + if err := c.checkCriuVersion(10502); err != nil { + return err + } + + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to save checkpoint") + } + + // Since a container can be C/R'ed multiple times, + // the checkpoint directory may already exist. + if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) { + return err + } + + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + + if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) { + return err + } + + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + + rpcOpts := criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + LogLevel: proto.Int32(4), + LogFile: proto.String("dump.log"), + Root: proto.String(c.config.Rootfs), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + Pid: proto.Int32(int32(c.initProcess.pid())), + ShellJob: proto.Bool(criuOpts.ShellJob), + LeaveRunning: proto.Bool(criuOpts.LeaveRunning), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + OrphanPtsMaster: proto.Bool(true), + AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), + } + + c.handleCriuConfigurationFile(&rpcOpts) + + // If the container is running in a network namespace and has + // a path to the network namespace configured, we will dump + // that network namespace as an external namespace and we + // will expect that the namespace exists during restore. + // This basically means that CRIU will ignore the namespace + // and expect to be setup correctly. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU expects the information about an external namespace + // like this: --external net[]: + // This is always 'extRootNetNS'. + var netns syscall.Stat_t + err = syscall.Stat(nsPath, &netns) + if err != nil { + return err + } + criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino) + rpcOpts.External = append(rpcOpts.External, criuExternal) + } + } + + fcg := c.cgroupManager.GetPaths()["freezer"] + if fcg != "" { + rpcOpts.FreezeCgroup = proto.String(fcg) + } + + // append optional criu opts, e.g., page-server and port + if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { + rpcOpts.Ps = &criurpc.CriuPageServerInfo{ + Address: proto.String(criuOpts.PageServer.Address), + Port: proto.Int32(criuOpts.PageServer.Port), + } + } + + //pre-dump may need parentImage param to complete iterative migration + if criuOpts.ParentImage != "" { + rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) + rpcOpts.TrackMem = proto.Bool(true) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + // criu 1.7 => 10700 + if err := c.checkCriuVersion(10700); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + rpcOpts.ManageCgroupsMode = &mode + } + + var t criurpc.CriuReqType + if criuOpts.PreDump { + feat := criurpc.CriuFeatures{ + MemTrack: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + t = criurpc.CriuReqType_PRE_DUMP + } else { + t = criurpc.CriuReqType_DUMP + } + req := &criurpc.CriuReq{ + Type: &t, + Opts: &rpcOpts, + } + + if criuOpts.LazyPages { + // lazy migration requested; check if criu supports it + feat := criurpc.CriuFeatures{ + LazyPages: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + statusRead, statusWrite, err := os.Pipe() + if err != nil { + return err + } + rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd())) + go waitForCriuLazyServer(statusRead, criuOpts.StatusFd) + } + + //no need to dump these information in pre-dump + if !criuOpts.PreDump { + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuDumpMount(req, m) + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuDumpMount(req, b) + } + } + } + + if err := c.addMaskPaths(req); err != nil { + return err + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuDumpMount(req, m) + } + + // Write the FD info to a file in the image directory + fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) + if err != nil { + return err + } + + err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600) + if err != nil { + return err + } + } + + err = c.criuSwrk(nil, req, criuOpts, false, nil) + if err != nil { + return err + } + return nil +} + +func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { + mountDest := m.Destination + if strings.HasPrefix(mountDest, c.config.Rootfs) { + mountDest = mountDest[len(c.config.Rootfs):] + } + + extMnt := &criurpc.ExtMountMap{ + Key: proto.String(mountDest), + Val: proto.String(m.Source), + } + req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +} + +func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(iface.HostInterfaceName) + veth.IfIn = proto.String(iface.Name) + req.Opts.Veths = append(req.Opts.Veths, veth) + case "loopback": + // Do nothing + } + } + for _, i := range criuOpts.VethPairs { + veth := new(criurpc.CriuVethPair) + veth.IfOut = proto.String(i.HostInterfaceName) + veth.IfIn = proto.String(i.ContainerInterfaceName) + req.Opts.Veths = append(req.Opts.Veths, veth) + } +} + +// makeCriuRestoreMountpoints makes the actual mountpoints for the +// restore using CRIU. This function is inspired from the code in +// rootfs_linux.go +func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { + switch m.Device { + case "cgroup": + // Do nothing for cgroup, CRIU should handle it + case "bind": + // The prepareBindMount() function checks if source + // exists. So it cannot be used for other filesystem types. + if err := prepareBindMount(m, c.config.Rootfs); err != nil { + return err + } + default: + // for all other file-systems just create the mountpoints + dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) + if err != nil { + return err + } + if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { + return err + } + m.Destination = dest + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + } + return nil +} + +// isPathInPrefixList is a small function for CRIU restore to make sure +// mountpoints, which are on a tmpfs, are not created in the roofs +func isPathInPrefixList(path string, prefix []string) bool { + for _, p := range prefix { + if strings.HasPrefix(path, p+"/") { + return false + } + } + return true +} + +// prepareCriuRestoreMounts tries to set up the rootfs of the +// container to be restored in the same way runc does it for +// initial container creation. Even for a read-only rootfs container +// runc modifies the rootfs to add mountpoints which do not exist. +// This function also creates missing mountpoints as long as they +// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. +func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error { + // First get a list of a all tmpfs mounts + tmpfs := []string{} + for _, m := range mounts { + switch m.Device { + case "tmpfs": + tmpfs = append(tmpfs, m.Destination) + } + } + // Now go through all mounts and create the mountpoints + // if the mountpoints are not on a tmpfs, as CRIU will + // restore the complete tmpfs content from its checkpoint. + for _, m := range mounts { + if isPathInPrefixList(m.Destination, tmpfs) { + if err := c.makeCriuRestoreMountpoints(m); err != nil { + return err + } + } + } + return nil +} + +func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + var extraFiles []*os.File + + // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + + // criu 1.5.2 => 10502 + if err := c.checkCriuVersion(10502); err != nil { + return err + } + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + // Since a container can be C/R'ed multiple times, + // the work directory may already exist. + if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) { + return err + } + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + if criuOpts.ImagesDirectory == "" { + return fmt.Errorf("invalid directory to restore checkpoint") + } + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + // CRIU has a few requirements for a root directory: + // * it must be a mount point + // * its parent must not be overmounted + // c.config.Rootfs is bind-mounted to a temporary directory + // to satisfy these requirements. + root := filepath.Join(c.root, "criu-root") + if err := os.Mkdir(root, 0755); err != nil { + return err + } + defer os.Remove(root) + root, err = filepath.EvalSymlinks(root) + if err != nil { + return err + } + err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") + if err != nil { + return err + } + defer unix.Unmount(root, unix.MNT_DETACH) + t := criurpc.CriuReqType_RESTORE + req := &criurpc.CriuReq{ + Type: &t, + Opts: &criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + EvasiveDevices: proto.Bool(true), + LogLevel: proto.Int32(4), + LogFile: proto.String("restore.log"), + RstSibling: proto.Bool(true), + Root: proto.String(root), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + ShellJob: proto.Bool(criuOpts.ShellJob), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + OrphanPtsMaster: proto.Bool(true), + AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), + }, + } + + c.handleCriuConfigurationFile(req.Opts) + + // Same as during checkpointing. If the container has a specific network namespace + // assigned to it, this now expects that the checkpoint will be restored in a + // already created network namespace. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU wants the information about an existing network namespace + // like this: --inherit-fd fd[]: + // The needs to be the same as during checkpointing. + // We are always using 'extRootNetNS' as the key in this. + netns, err := os.Open(nsPath) + defer netns.Close() + if err != nil { + logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) + return fmt.Errorf("Requested network namespace %v does not exist", nsPath) + } + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String("extRootNetNS") + // The offset of four is necessary because 0, 1, 2 and 3 is already + // used by stdin, stdout, stderr, 'criu swrk' socket. + inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + // All open FDs need to be transferred to CRIU via extraFiles + extraFiles = append(extraFiles, netns) + } + } + + // This will modify the rootfs of the container in the same way runc + // modifies the container during initial creation. + if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { + return err + } + + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuRestoreMount(req, m) + case "cgroup": + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuRestoreMount(req, b) + } + } + } + + if len(c.config.MaskPaths) > 0 { + m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} + c.addCriuRestoreMount(req, m) + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuRestoreMount(req, m) + } + + if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + // criu 1.7 => 10700 + if err := c.checkCriuVersion(10700); err != nil { + return err + } + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + req.Opts.ManageCgroupsMode = &mode + } + + var ( + fds []string + fdJSON []byte + ) + if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { + return err + } + + if err := json.Unmarshal(fdJSON, &fds); err != nil { + return err + } + for i := range fds { + if s := fds[i]; strings.Contains(s, "pipe:") { + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String(s) + inheritFd.Fd = proto.Int32(int32(i)) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + } + } + return c.criuSwrk(process, req, criuOpts, true, extraFiles) +} + +func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // XXX: Do we need to deal with this case? AFAIK criu still requires root. + if err := c.cgroupManager.Apply(pid); err != nil { + return err + } + + if err := c.cgroupManager.Set(c.config); err != nil { + return newSystemError(err) + } + + path := fmt.Sprintf("/proc/%d/cgroup", pid) + cgroupsPaths, err := cgroups.ParseCgroupFile(path) + if err != nil { + return err + } + + for c, p := range cgroupsPaths { + cgroupRoot := &criurpc.CgroupRoot{ + Ctrl: proto.String(c), + Path: proto.String(p), + } + req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) + } + + return nil +} + +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return err + } + + var logPath string + if opts != nil { + logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) + } else { + // For the VERSION RPC 'opts' is set to 'nil' and therefore + // opts.WorkDirectory does not exist. Set logPath to "". + logPath = "" + } + criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") + criuClientFileCon, err := net.FileConn(criuClient) + criuClient.Close() + if err != nil { + return err + } + + criuClientCon := criuClientFileCon.(*net.UnixConn) + defer criuClientCon.Close() + + criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") + defer criuServer.Close() + + args := []string{"swrk", "3"} + if c.criuVersion != 0 { + // If the CRIU Version is still '0' then this is probably + // the initial CRIU run to detect the version. Skip it. + logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) + } + logrus.Debugf("Using CRIU with following args: %s", args) + cmd := exec.Command(c.criuPath, args...) + if process != nil { + cmd.Stdin = process.Stdin + cmd.Stdout = process.Stdout + cmd.Stderr = process.Stderr + } + cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + if extraFiles != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) + } + + if err := cmd.Start(); err != nil { + return err + } + criuServer.Close() + + defer func() { + criuClientCon.Close() + _, err := cmd.Process.Wait() + if err != nil { + return + } + }() + + if applyCgroups { + err := c.criuApplyCgroups(cmd.Process.Pid, req) + if err != nil { + return err + } + } + + var extFds []string + if process != nil { + extFds, err = getPipeFds(cmd.Process.Pid) + if err != nil { + return err + } + } + + logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) + // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() + // should be empty. For older CRIU versions it still will be + // available but empty. criurpc.CriuReqType_VERSION actually + // has no req.GetOpts(). + if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || + req.GetType() == criurpc.CriuReqType_VERSION) { + + val := reflect.ValueOf(req.GetOpts()) + v := reflect.Indirect(val) + for i := 0; i < v.NumField(); i++ { + st := v.Type() + name := st.Field(i).Name + if strings.HasPrefix(name, "XXX_") { + continue + } + value := val.MethodByName("Get" + name).Call([]reflect.Value{}) + logrus.Debugf("CRIU option %s with value %v", name, value[0]) + } + } + data, err := proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + + buf := make([]byte, 10*4096) + oob := make([]byte, 4096) + for true { + n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) + if err != nil { + return err + } + if n == 0 { + return fmt.Errorf("unexpected EOF") + } + if n == len(buf) { + return fmt.Errorf("buffer is too small") + } + + resp := new(criurpc.CriuResp) + err = proto.Unmarshal(buf[:n], resp) + if err != nil { + return err + } + if !resp.GetSuccess() { + typeString := req.GetType().String() + if typeString == "VERSION" { + // If the VERSION RPC fails this probably means that the CRIU + // version is too old for this RPC. Just return 'nil'. + return nil + } + return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) + } + + t := resp.GetType() + switch { + case t == criurpc.CriuReqType_VERSION: + logrus.Debugf("CRIU version: %s", resp) + criuVersionRPC = resp.GetVersion() + break + case t == criurpc.CriuReqType_FEATURE_CHECK: + logrus.Debugf("Feature check says: %s", resp) + criuFeatures = resp.GetFeatures() + case t == criurpc.CriuReqType_NOTIFY: + if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil { + return err + } + t = criurpc.CriuReqType_NOTIFY + req = &criurpc.CriuReq{ + Type: &t, + NotifySuccess: proto.Bool(true), + } + data, err = proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + continue + case t == criurpc.CriuReqType_RESTORE: + case t == criurpc.CriuReqType_DUMP: + case t == criurpc.CriuReqType_PRE_DUMP: + default: + return fmt.Errorf("unable to parse the response %s", resp.String()) + } + + break + } + + criuClientCon.CloseWrite() + // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. + // Here we want to wait only the CRIU process. + st, err := cmd.Process.Wait() + if err != nil { + return err + } + + // In pre-dump mode CRIU is in a loop and waits for + // the final DUMP command. + // The current runc pre-dump approach, however, is + // start criu in PRE_DUMP once for a single pre-dump + // and not the whole series of pre-dump, pre-dump, ...m, dump + // If we got the message CriuReqType_PRE_DUMP it means + // CRIU was successful and we need to forcefully stop CRIU + if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { + return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) + } + return nil +} + +// block any external network activity +func lockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + + if err := strategy.detach(config); err != nil { + return err + } + } + return nil +} + +func unlockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err = strategy.attach(config); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error { + notify := resp.GetNotify() + if notify == nil { + return fmt.Errorf("invalid response: %s", resp.String()) + } + logrus.Debugf("notify: %s\n", notify.GetScript()) + switch { + case notify.GetScript() == "post-dump": + f, err := os.Create(filepath.Join(c.root, "checkpoint")) + if err != nil { + return err + } + f.Close() + case notify.GetScript() == "network-unlock": + if err := unlockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "network-lock": + if err := lockNetwork(c.config); err != nil { + return err + } + case notify.GetScript() == "setup-namespaces": + if c.config.Hooks != nil { + s, err := c.currentOCIState() + if err != nil { + return nil + } + s.Pid = int(notify.GetPid()) + for i, hook := range c.config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + case notify.GetScript() == "post-restore": + pid := notify.GetPid() + r, err := newRestoredProcess(int(pid), fds) + if err != nil { + return err + } + process.ops = r + if err := c.state.transition(&restoredState{ + imageDir: opts.ImagesDirectory, + c: c, + }); err != nil { + return err + } + // create a timestamp indicating when the restored checkpoint was started + c.created = time.Now().UTC() + if _, err := c.updateState(r); err != nil { + return err + } + if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + logrus.Error(err) + } + } + case notify.GetScript() == "orphan-pts-master": + scm, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return err + } + fds, err := unix.ParseUnixRights(&scm[0]) + if err != nil { + return err + } + + master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") + defer master.Close() + + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) updateState(process parentProcess) (*State, error) { + if process != nil { + c.initProcess = process + } + state, err := c.currentState() + if err != nil { + return nil, err + } + err = c.saveState(state) + if err != nil { + return nil, err + } + return state, nil +} + +func (c *linuxContainer) saveState(s *State) error { + f, err := os.Create(filepath.Join(c.root, stateFilename)) + if err != nil { + return err + } + defer f.Close() + return utils.WriteJSON(f, s) +} + +func (c *linuxContainer) deleteState() error { + return os.Remove(filepath.Join(c.root, stateFilename)) +} + +func (c *linuxContainer) currentStatus() (Status, error) { + if err := c.refreshState(); err != nil { + return -1, err + } + return c.state.status(), nil +} + +// refreshState needs to be called to verify that the current state on the +// container is what is true. Because consumers of libcontainer can use it +// out of process we need to verify the container's status based on runtime +// information and not rely on our in process info. +func (c *linuxContainer) refreshState() error { + paused, err := c.isPaused() + if err != nil { + return err + } + if paused { + return c.state.transition(&pausedState{c: c}) + } + t, err := c.runType() + if err != nil { + return err + } + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: + return c.state.transition(&runningState{c: c}) + } + return c.state.transition(&stoppedState{c: c}) +} + +func (c *linuxContainer) runType() (Status, error) { + if c.initProcess == nil { + return Stopped, nil + } + pid := c.initProcess.pid() + stat, err := system.Stat(pid) + if err != nil { + return Stopped, nil + } + if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { + return Stopped, nil + } + // We'll create exec fifo and blocking on it after container is created, + // and delete it after start container. + if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { + return Created, nil + } + return Running, nil +} + +func (c *linuxContainer) isPaused() (bool, error) { + fcg := c.cgroupManager.GetPaths()["freezer"] + if fcg == "" { + // A container doesn't have a freezer cgroup + return false, nil + } + pausedState := "FROZEN" + filename := "freezer.state" + if cgroups.IsCgroup2UnifiedMode() { + filename = "cgroup.freeze" + pausedState = "1" + } + + data, err := ioutil.ReadFile(filepath.Join(fcg, filename)) + if err != nil { + // If freezer cgroup is not mounted, the container would just be not paused. + if os.IsNotExist(err) || err == syscall.ENODEV { + return false, nil + } + return false, newSystemErrorWithCause(err, "checking if container is paused") + } + return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil +} + +func (c *linuxContainer) currentState() (*State, error) { + var ( + startTime uint64 + externalDescriptors []string + pid = -1 + ) + if c.initProcess != nil { + pid = c.initProcess.pid() + startTime, _ = c.initProcess.startTime() + externalDescriptors = c.initProcess.externalDescriptors() + } + intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) + if err != nil { + intelRdtPath = "" + } + state := &State{ + BaseState: BaseState{ + ID: c.ID(), + Config: *c.config, + InitProcessPid: pid, + InitProcessStartTime: startTime, + Created: c.created, + }, + Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, + CgroupPaths: c.cgroupManager.GetPaths(), + IntelRdtPath: intelRdtPath, + NamespacePaths: make(map[configs.NamespaceType]string), + ExternalDescriptors: externalDescriptors, + } + if pid > 0 { + for _, ns := range c.config.Namespaces { + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + for _, nsType := range configs.NamespaceTypes() { + if !configs.IsNamespaceSupported(nsType) { + continue + } + if _, ok := state.NamespacePaths[nsType]; !ok { + ns := configs.Namespace{Type: nsType} + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + } + } + return state, nil +} + +func (c *linuxContainer) currentOCIState() (*specs.State, error) { + bundle, annotations := utils.Annotations(c.config.Labels) + state := &specs.State{ + Version: specs.Version, + ID: c.ID(), + Bundle: bundle, + Annotations: annotations, + } + status, err := c.currentStatus() + if err != nil { + return nil, err + } + state.Status = status.String() + if status != Stopped { + if c.initProcess != nil { + state.Pid = c.initProcess.pid() + } + } + return state, nil +} + +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + for _, ns := range configs.NamespaceTypes() { + + // Remove namespaces that we don't need to join. + if !c.config.Namespaces.Contains(ns) { + continue + } + + if p, ok := namespaces[ns]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(ns) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) + } + + } + + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + + // write cloneFlags + r.AddData(&Int32msg{ + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), + }) + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + if c.config.RootlessEUID && c.newuidmapPath != "" { + r.AddData(&Bytemsg{ + Type: UidmapPathAttr, + Value: []byte(c.newuidmapPath), + }) + } + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + if c.config.RootlessEUID && c.newgidmapPath != "" { + r.AddData(&Bytemsg{ + Type: GidmapPathAttr, + Value: []byte(c.newgidmapPath), + }) + } + if requiresRootOrMappingTool(c.config) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + + if c.config.OomScoreAdj != nil { + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), + }) + } + + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessEUIDAttr, + Value: c.config.RootlessEUID, + }) + + return bytes.NewReader(r.Serialize()), nil +} + +// ignoreTerminateErrors returns nil if the given err matches an error known +// to indicate that the terminate occurred successfully or err was nil, otherwise +// err is returned unaltered. +func ignoreTerminateErrors(err error) error { + if err == nil { + return nil + } + s := err.Error() + switch { + case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"): + return nil + } + return err +} + +func requiresRootOrMappingTool(c *configs.Config) bool { + gidMap := []configs.IDMap{ + {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + } + return !reflect.DeepEqual(c.GidMappings, gidMap) +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go new file mode 100644 index 0000000..f8af05d --- /dev/null +++ b/libcontainer/container_linux_test.go @@ -0,0 +1,372 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/system" +) + +type mockCgroupManager struct { + pids []int + allPids []int + stats *cgroups.Stats + paths map[string]string +} + +type mockIntelRdtManager struct { + stats *intelrdt.Stats + path string +} + +func (m *mockCgroupManager) GetPids() ([]int, error) { + return m.pids, nil +} + +func (m *mockCgroupManager) GetAllPids() ([]int, error) { + return m.allPids, nil +} + +func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) { + return m.stats, nil +} + +func (m *mockCgroupManager) Apply(pid int) error { + return nil +} + +func (m *mockCgroupManager) Set(container *configs.Config) error { + return nil +} + +func (m *mockCgroupManager) Destroy() error { + return nil +} + +func (m *mockCgroupManager) GetPaths() map[string]string { + return m.paths +} + +func (m *mockCgroupManager) GetUnifiedPath() (string, error) { + return "", fmt.Errorf("unimplemented") +} + +func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { + return nil +} +func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) { + return nil, nil +} + +func (m *mockIntelRdtManager) Apply(pid int) error { + return nil +} + +func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) { + return m.stats, nil +} + +func (m *mockIntelRdtManager) Destroy() error { + return nil +} + +func (m *mockIntelRdtManager) GetPath() string { + return m.path +} + +func (m *mockIntelRdtManager) Set(container *configs.Config) error { + return nil +} + +func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) { + return nil, nil +} + +type mockProcess struct { + _pid int + started uint64 +} + +func (m *mockProcess) terminate() error { + return nil +} + +func (m *mockProcess) pid() int { + return m._pid +} + +func (m *mockProcess) startTime() (uint64, error) { + return m.started, nil +} + +func (m *mockProcess) start() error { + return nil +} + +func (m *mockProcess) wait() (*os.ProcessState, error) { + return nil, nil +} + +func (m *mockProcess) signal(_ os.Signal) error { + return nil +} + +func (m *mockProcess) externalDescriptors() []string { + return []string{} +} + +func (m *mockProcess) setExternalDescriptors(newFds []string) { +} + +func (m *mockProcess) forwardChildLogs() { +} + +func TestGetContainerPids(t *testing.T) { + container := &linuxContainer{ + id: "myid", + config: &configs.Config{}, + cgroupManager: &mockCgroupManager{allPids: []int{1, 2, 3}}, + } + pids, err := container.Processes() + if err != nil { + t.Fatal(err) + } + for i, expected := range []int{1, 2, 3} { + if pids[i] != expected { + t.Fatalf("expected pid %d but received %d", expected, pids[i]) + } + } +} + +func TestGetContainerStats(t *testing.T) { + container := &linuxContainer{ + id: "myid", + config: &configs.Config{}, + cgroupManager: &mockCgroupManager{ + pids: []int{1, 2, 3}, + stats: &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Usage: cgroups.MemoryData{ + Usage: 1024, + }, + }, + }, + }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f;1=f0", + MemBwSchema: "MB:0=20;1=70", + }, + }, + } + stats, err := container.Stats() + if err != nil { + t.Fatal(err) + } + if stats.CgroupStats == nil { + t.Fatal("cgroup stats are nil") + } + if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 { + t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage) + } + if intelrdt.IsCatEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" { + t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema) + } + } + if intelrdt.IsMbaEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" { + t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema) + } + } +} + +func TestGetContainerState(t *testing.T) { + var ( + pid = os.Getpid() + expectedMemoryPath = "/sys/fs/cgroup/memory/myid" + expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid) + expectedIntelRdtPath = "/sys/fs/resctrl/myid" + ) + container := &linuxContainer{ + id: "myid", + config: &configs.Config{ + Namespaces: []configs.Namespace{ + {Type: configs.NEWPID}, + {Type: configs.NEWNS}, + {Type: configs.NEWNET, Path: expectedNetworkPath}, + {Type: configs.NEWUTS}, + // emulate host for IPC + //{Type: configs.NEWIPC}, + {Type: configs.NEWCGROUP}, + }, + }, + initProcess: &mockProcess{ + _pid: pid, + started: 10, + }, + cgroupManager: &mockCgroupManager{ + pids: []int{1, 2, 3}, + stats: &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Usage: cgroups.MemoryData{ + Usage: 1024, + }, + }, + }, + paths: map[string]string{ + "memory": expectedMemoryPath, + }, + }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f0;1=f", + MemBwSchema: "MB:0=70;1=20", + }, + path: expectedIntelRdtPath, + }, + } + container.state = &createdState{c: container} + state, err := container.State() + if err != nil { + t.Fatal(err) + } + if state.InitProcessPid != pid { + t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid) + } + if state.InitProcessStartTime != 10 { + t.Fatalf("expected process start time 10 but received %d", state.InitProcessStartTime) + } + paths := state.CgroupPaths + if paths == nil { + t.Fatal("cgroup paths should not be nil") + } + if memPath := paths["memory"]; memPath != expectedMemoryPath { + t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath) + } + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + intelRdtPath := state.IntelRdtPath + if intelRdtPath == "" { + t.Fatal("intel rdt path should not be empty") + } + if intelRdtPath != expectedIntelRdtPath { + t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath) + } + } + for _, ns := range container.config.Namespaces { + path := state.NamespacePaths[ns.Type] + if path == "" { + t.Fatalf("expected non nil namespace path for %s", ns.Type) + } + if ns.Type == configs.NEWNET { + if path != expectedNetworkPath { + t.Fatalf("expected path %q but received %q", expectedNetworkPath, path) + } + } else { + file := "" + switch ns.Type { + case configs.NEWNET: + file = "net" + case configs.NEWNS: + file = "mnt" + case configs.NEWPID: + file = "pid" + case configs.NEWIPC: + file = "ipc" + case configs.NEWUSER: + file = "user" + case configs.NEWUTS: + file = "uts" + case configs.NEWCGROUP: + file = "cgroup" + } + expected := fmt.Sprintf("/proc/%d/ns/%s", pid, file) + if expected != path { + t.Fatalf("expected path %q but received %q", expected, path) + } + } + } +} + +func TestGetContainerStateAfterUpdate(t *testing.T) { + var ( + pid = os.Getpid() + ) + stat, err := system.Stat(pid) + if err != nil { + t.Fatal(err) + } + + rootDir, err := ioutil.TempDir("", "TestGetContainerStateAfterUpdate") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(rootDir) + + container := &linuxContainer{ + root: rootDir, + id: "myid", + config: &configs.Config{ + Namespaces: []configs.Namespace{ + {Type: configs.NEWPID}, + {Type: configs.NEWNS}, + {Type: configs.NEWNET}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + }, + Cgroups: &configs.Cgroup{ + Resources: &configs.Resources{ + Memory: 1024, + }, + }, + }, + initProcess: &mockProcess{ + _pid: pid, + started: stat.StartTime, + }, + cgroupManager: &mockCgroupManager{}, + } + container.state = &createdState{c: container} + state, err := container.State() + if err != nil { + t.Fatal(err) + } + if state.InitProcessPid != pid { + t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid) + } + if state.InitProcessStartTime != stat.StartTime { + t.Fatalf("expected process start time %d but received %d", stat.StartTime, state.InitProcessStartTime) + } + if state.Config.Cgroups.Resources.Memory != 1024 { + t.Fatalf("expected Memory to be 1024 but received %q", state.Config.Cgroups.Memory) + } + + // Set initProcessStartTime so we fake to be running + container.initProcessStartTime = state.InitProcessStartTime + container.state = &runningState{c: container} + newConfig := container.Config() + newConfig.Cgroups.Resources.Memory = 2048 + if err := container.Set(newConfig); err != nil { + t.Fatal(err) + } + state, err = container.State() + if err != nil { + t.Fatal(err) + } + if state.Config.Cgroups.Resources.Memory != 2048 { + t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory) + } +} diff --git a/libcontainer/criu_opts_linux.go b/libcontainer/criu_opts_linux.go new file mode 100644 index 0000000..a2e344f --- /dev/null +++ b/libcontainer/criu_opts_linux.go @@ -0,0 +1,40 @@ +package libcontainer + +// cgroup restoring strategy provided by criu +type cgMode uint32 + +const ( + CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu + CRIU_CG_MODE_FULL // always restore all cgroups and their properties + CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system + CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT +) + +type CriuPageServerInfo struct { + Address string // IP address of CRIU page server + Port int32 // port number of CRIU page server +} + +type VethPairName struct { + ContainerInterfaceName string + HostInterfaceName string +} + +type CriuOpts struct { + ImagesDirectory string // directory for storing image files + WorkDirectory string // directory to cd and write logs/pidfiles/stats to + ParentImage string // directory for storing parent image files in pre-dump and dump + LeaveRunning bool // leave container in running state after checkpoint + TcpEstablished bool // checkpoint/restore established TCP connections + ExternalUnixConnections bool // allow external unix connections + ShellJob bool // allow to dump and restore shell jobs + FileLocks bool // handle file locks, for safety + PreDump bool // call criu predump to perform iterative checkpoint + PageServer CriuPageServerInfo // allow to dump to criu page server + VethPairs []VethPairName // pass the veth to criu when restore + ManageCgroupsMode cgMode // dump or restore cgroup mode + EmptyNs uint32 // don't c/r properties for namespace from this mask + AutoDedup bool // auto deduplication for incremental dumps + LazyPages bool // restore memory pages lazily using userfaultfd + StatusFd string // fd for feedback when lazy server is ready +} diff --git a/libcontainer/devices/devices.go b/libcontainer/devices/devices.go new file mode 100644 index 0000000..5dabe06 --- /dev/null +++ b/libcontainer/devices/devices.go @@ -0,0 +1,110 @@ +package devices + +import ( + "errors" + "io/ioutil" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +var ( + // ErrNotADevice denotes that a file is not a valid linux device. + ErrNotADevice = errors.New("not a device node") +) + +// Testing dependencies +var ( + unixLstat = unix.Lstat + ioutilReadDir = ioutil.ReadDir +) + +// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the +// information about a linux device and return that information as a Device struct. +func DeviceFromPath(path, permissions string) (*configs.Device, error) { + var stat unix.Stat_t + err := unixLstat(path, &stat) + if err != nil { + return nil, err + } + + var ( + devNumber = uint64(stat.Rdev) + major = unix.Major(devNumber) + minor = unix.Minor(devNumber) + ) + if major == 0 { + return nil, ErrNotADevice + } + + var ( + devType rune + mode = stat.Mode + ) + switch { + case mode&unix.S_IFBLK == unix.S_IFBLK: + devType = 'b' + case mode&unix.S_IFCHR == unix.S_IFCHR: + devType = 'c' + } + return &configs.Device{ + Type: devType, + Path: path, + Major: int64(major), + Minor: int64(minor), + Permissions: permissions, + FileMode: os.FileMode(mode), + Uid: stat.Uid, + Gid: stat.Gid, + }, nil +} + +// HostDevices returns all devices that can be found under /dev directory. +func HostDevices() ([]*configs.Device, error) { + return GetDevices("/dev") +} + +// GetDevices recursively traverses a directory specified by path +// and returns all devices found there. +func GetDevices(path string) ([]*configs.Device, error) { + files, err := ioutilReadDir(path) + if err != nil { + return nil, err + } + var out []*configs.Device + for _, f := range files { + switch { + case f.IsDir(): + switch f.Name() { + // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825 + // ".udev" added to address https://github.com/opencontainers/runc/issues/2093 + case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev": + continue + default: + sub, err := GetDevices(filepath.Join(path, f.Name())) + if err != nil { + return nil, err + } + + out = append(out, sub...) + continue + } + case f.Name() == "console": + continue + } + device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm") + if err != nil { + if err == ErrNotADevice { + continue + } + if os.IsNotExist(err) { + continue + } + return nil, err + } + out = append(out, device) + } + return out, nil +} diff --git a/libcontainer/devices/devices_test.go b/libcontainer/devices/devices_test.go new file mode 100644 index 0000000..0afa9d9 --- /dev/null +++ b/libcontainer/devices/devices_test.go @@ -0,0 +1,63 @@ +package devices + +import ( + "errors" + "os" + "testing" + + "golang.org/x/sys/unix" +) + +func TestDeviceFromPathLstatFailure(t *testing.T) { + testError := errors.New("test error") + + // Override unix.Lstat to inject error. + unixLstat = func(path string, stat *unix.Stat_t) error { + return testError + } + + _, err := DeviceFromPath("", "") + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirFailure(t *testing.T) { + testError := errors.New("test error") + + // Override ioutil.ReadDir to inject error. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + return nil, testError + } + + _, err := HostDevices() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) { + testError := errors.New("test error") + called := false + + // Override ioutil.ReadDir to inject error after the first call. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + if called { + return nil, testError + } + called = true + + // Provoke a second call. + fi, err := os.Lstat("/tmp") + if err != nil { + t.Fatalf("Unexpected error %v", err) + } + + return []os.FileInfo{fi}, nil + } + + _, err := HostDevices() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} diff --git a/libcontainer/error.go b/libcontainer/error.go new file mode 100644 index 0000000..21a3789 --- /dev/null +++ b/libcontainer/error.go @@ -0,0 +1,70 @@ +package libcontainer + +import "io" + +// ErrorCode is the API error code type. +type ErrorCode int + +// API error codes. +const ( + // Factory errors + IdInUse ErrorCode = iota + InvalidIdFormat + + // Container errors + ContainerNotExists + ContainerPaused + ContainerNotStopped + ContainerNotRunning + ContainerNotPaused + + // Process errors + NoProcessOps + + // Common errors + ConfigInvalid + ConsoleExists + SystemError +) + +func (c ErrorCode) String() string { + switch c { + case IdInUse: + return "Id already in use" + case InvalidIdFormat: + return "Invalid format" + case ContainerPaused: + return "Container paused" + case ConfigInvalid: + return "Invalid configuration" + case SystemError: + return "System error" + case ContainerNotExists: + return "Container does not exist" + case ContainerNotStopped: + return "Container is not stopped" + case ContainerNotRunning: + return "Container is not running" + case ConsoleExists: + return "Console exists for process" + case ContainerNotPaused: + return "Container is not paused" + case NoProcessOps: + return "No process operations" + default: + return "Unknown error" + } +} + +// Error is the API error type. +type Error interface { + error + + // Returns an error if it failed to write the detail of the Error to w. + // The detail of the Error may include the error message and a + // representation of the stack trace. + Detail(w io.Writer) error + + // Returns the error code for this error. + Code() ErrorCode +} diff --git a/libcontainer/error_test.go b/libcontainer/error_test.go new file mode 100644 index 0000000..36841ad --- /dev/null +++ b/libcontainer/error_test.go @@ -0,0 +1,25 @@ +package libcontainer + +import "testing" + +func TestErrorCode(t *testing.T) { + codes := map[ErrorCode]string{ + IdInUse: "Id already in use", + InvalidIdFormat: "Invalid format", + ContainerPaused: "Container paused", + ConfigInvalid: "Invalid configuration", + SystemError: "System error", + ContainerNotExists: "Container does not exist", + ContainerNotStopped: "Container is not stopped", + ContainerNotRunning: "Container is not running", + ConsoleExists: "Console exists for process", + ContainerNotPaused: "Container is not paused", + NoProcessOps: "No process operations", + } + + for code, expected := range codes { + if actual := code.String(); actual != expected { + t.Fatalf("expected string %q but received %q", expected, actual) + } + } +} diff --git a/libcontainer/factory.go b/libcontainer/factory.go new file mode 100644 index 0000000..0986cd7 --- /dev/null +++ b/libcontainer/factory.go @@ -0,0 +1,44 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Factory interface { + // Creates a new container with the given id and starts the initial process inside it. + // id must be a string containing only letters, digits and underscores and must contain + // between 1 and 1024 characters, inclusive. + // + // The id must not already be in use by an existing container. Containers created using + // a factory with the same path (and filesystem) must have distinct ids. + // + // Returns the new container with a running process. + // + // errors: + // IdInUse - id is already in use by a container + // InvalidIdFormat - id has incorrect format + // ConfigInvalid - config is invalid + // Systemerror - System error + // + // On error, any partially created container parts are cleaned up (the operation is atomic). + Create(id string, config *configs.Config) (Container, error) + + // Load takes an ID for an existing container and returns the container information + // from the state. This presents a read only view of the container. + // + // errors: + // Path does not exist + // System error + Load(id string) (Container, error) + + // StartInitialization is an internal API to libcontainer used during the reexec of the + // container. + // + // Errors: + // Pipe connection error + // System error + StartInitialization() error + + // Type returns info string about factory type (e.g. lxc, libcontainer...) + Type() string +} diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go new file mode 100644 index 0000000..437633c --- /dev/null +++ b/libcontainer/factory_linux.go @@ -0,0 +1,427 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "regexp" + "runtime/debug" + "strconv" + + "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/mount" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +const ( + stateFilename = "state.json" + execFifoFilename = "exec.fifo" +) + +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// InitArgs returns an options func to configure a LinuxFactory with the +// provided init binary path and arguments. +func InitArgs(args ...string) func(*LinuxFactory) error { + return func(l *LinuxFactory) (err error) { + if len(args) > 0 { + // Resolve relative paths to ensure that its available + // after directory changes. + if args[0], err = filepath.Abs(args[0]); err != nil { + return newGenericError(err, ConfigInvalid) + } + } + + l.InitArgs = args + return nil + } +} + +// SystemdCgroups is an options func to configure a LinuxFactory to return +// containers that use systemd to create and manage cgroups. +func SystemdCgroups(l *LinuxFactory) error { + systemdCgroupsManager, err := systemd.NewSystemdCgroupsManager() + if err != nil { + return err + } + l.NewCgroupsManager = systemdCgroupsManager + return nil +} + +func getUnifiedPath(paths map[string]string) string { + unifiedPath := "" + for k, v := range paths { + if unifiedPath == "" { + unifiedPath = v + } else if v != unifiedPath { + panic(errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v)) + } + } + // can be empty + return unifiedPath +} + +func cgroupfs2(l *LinuxFactory, rootless bool) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless) + if err != nil { + panic(err) + } + return m + } + return nil +} + +// Cgroupfs is an options func to configure a LinuxFactory to return containers +// that use the native cgroups filesystem implementation to create and manage +// cgroups. +func Cgroupfs(l *LinuxFactory) error { + if cgroups.IsCgroup2UnifiedMode() { + return cgroupfs2(l, false) + } + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + +// RootlessCgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to create +// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is +// that RootlessCgroupfs can transparently handle permission errors that occur +// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if +// they've been set up properly). +func RootlessCgroupfs(l *LinuxFactory) error { + if cgroups.IsCgroup2UnifiedMode() { + return cgroupfs2(l, true) + } + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Rootless: true, + Paths: paths, + } + } + return nil +} + +// IntelRdtfs is an options func to configure a LinuxFactory to return +// containers that use the Intel RDT "resource control" filesystem to +// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). +func IntelRdtFs(l *LinuxFactory) error { + l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { + return &intelrdt.IntelRdtManager{ + Config: config, + Id: id, + Path: path, + } + } + return nil +} + +// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. +func TmpfsRoot(l *LinuxFactory) error { + mounted, err := mount.Mounted(l.Root) + if err != nil { + return err + } + if !mounted { + if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil { + return err + } + } + return nil +} + +// CriuPath returns an option func to configure a LinuxFactory with the +// provided criupath +func CriuPath(criupath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.CriuPath = criupath + return nil + } +} + +// New returns a linux based container factory based in the root directory and +// configures the factory with the provided option funcs. +func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { + if root != "" { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, newGenericError(err, SystemError) + } + } + l := &LinuxFactory{ + Root: root, + InitPath: "/proc/self/exe", + InitArgs: []string{os.Args[0], "init"}, + Validator: validate.New(), + CriuPath: "criu", + } + Cgroupfs(l) + for _, opt := range options { + if opt == nil { + continue + } + if err := opt(l); err != nil { + return nil, err + } + } + return l, nil +} + +// LinuxFactory implements the default factory interface for linux based systems. +type LinuxFactory struct { + // Root directory for the factory to store state. + Root string + + // InitPath is the path for calling the init responsibilities for spawning + // a container. + InitPath string + + // InitArgs are arguments for calling the init responsibilities for spawning + // a container. + InitArgs []string + + // CriuPath is the path to the criu binary used for checkpoint and restore of + // containers. + CriuPath string + + // New{u,g}uidmapPath is the path to the binaries used for mapping with + // rootless containers. + NewuidmapPath string + NewgidmapPath string + + // Validator provides validation to container configurations. + Validator validate.Validator + + // NewCgroupsManager returns an initialized cgroups manager for a single container. + NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + + // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. + NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager +} + +func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + if err := l.validateID(id); err != nil { + return nil, err + } + if err := l.Validator.Validate(config); err != nil { + return nil, newGenericError(err, ConfigInvalid) + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } + if _, err := os.Stat(containerRoot); err == nil { + return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) + } else if !os.IsNotExist(err) { + return nil, newGenericError(err, SystemError) + } + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil { + return nil, newGenericError(err, SystemError) + } + c := &linuxContainer{ + id: id, + root: containerRoot, + config: config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, + cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + } + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + c.intelRdtManager = l.NewIntelRdtManager(config, id, "") + } + c.state = &stoppedState{c: c} + return c, nil +} + +func (l *LinuxFactory) Load(id string) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + //when load, we need to check id is valid or not. + if err := l.validateID(id); err != nil { + return nil, err + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } + state, err := l.loadState(containerRoot, id) + if err != nil { + return nil, err + } + r := &nonChildProcess{ + processPid: state.InitProcessPid, + processStartTime: state.InitProcessStartTime, + fds: state.ExternalDescriptors, + } + c := &linuxContainer{ + initProcess: r, + initProcessStartTime: state.InitProcessStartTime, + id: id, + config: &state.Config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, + } + c.state = &loadedState{c: c} + if err := c.refreshState(); err != nil { + return nil, err + } + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } + return c, nil +} + +func (l *LinuxFactory) Type() string { + return "libcontainer" +} + +// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state +// This is a low level implementation detail of the reexec and should not be consumed externally +func (l *LinuxFactory) StartInitialization() (err error) { + var ( + pipefd, fifofd int + consoleSocket *os.File + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") + envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") + envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") + ) + + // Get the INITPIPE. + pipefd, err = strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) + } + + var ( + pipe = os.NewFile(uintptr(pipefd), "pipe") + it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) + ) + defer pipe.Close() + + // Only init processes have FIFOFD. + fifofd = -1 + if it == initStandard { + if fifofd, err = strconv.Atoi(envFifoFd); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) + } + } + + if envConsole != "" { + console, err := strconv.Atoi(envConsole) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err) + } + consoleSocket = os.NewFile(uintptr(console), "console-socket") + defer consoleSocket.Close() + } + + // clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() + + defer func() { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. + if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + }() + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + } + }() + + i, err := newContainerInit(it, pipe, consoleSocket, fifofd) + if err != nil { + return err + } + + // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. + return i.Init() +} + +func (l *LinuxFactory) loadState(root, id string) (*State, error) { + stateFilePath, err := securejoin.SecureJoin(root, stateFilename) + if err != nil { + return nil, err + } + f, err := os.Open(stateFilePath) + if err != nil { + if os.IsNotExist(err) { + return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) + } + return nil, newGenericError(err, SystemError) + } + defer f.Close() + var state *State + if err := json.NewDecoder(f).Decode(&state); err != nil { + return nil, newGenericError(err, SystemError) + } + return state, nil +} + +func (l *LinuxFactory) validateID(id string) error { + if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) { + return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + } + + return nil +} + +// NewuidmapPath returns an option func to configure a LinuxFactory with the +// provided .. +func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewuidmapPath = newuidmapPath + return nil + } +} + +// NewgidmapPath returns an option func to configure a LinuxFactory with the +// provided .. +func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewgidmapPath = newgidmapPath + return nil + } +} diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go new file mode 100644 index 0000000..8d0ca8a --- /dev/null +++ b/libcontainer/factory_linux_test.go @@ -0,0 +1,235 @@ +// +build linux + +package libcontainer + +import ( + "io/ioutil" + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/mount" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +func newTestRoot() (string, error) { + dir, err := ioutil.TempDir("", "libcontainer") + if err != nil { + return "", err + } + return dir, nil +} + +func TestFactoryNew(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + +func TestFactoryNewIntelRdt(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + +func TestFactoryNewTmpfs(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, TmpfsRoot) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } + mounted, err := mount.Mounted(lfactory.Root) + if err != nil { + t.Fatal(err) + } + if !mounted { + t.Fatalf("Factory Root is not mounted") + } + mounts, err := mount.GetMounts() + if err != nil { + t.Fatal(err) + } + var found bool + for _, m := range mounts { + if m.Mountpoint == lfactory.Root { + if m.Fstype != "tmpfs" { + t.Fatalf("Fstype of root: %s, expected %s", m.Fstype, "tmpfs") + } + if m.Source != "tmpfs" { + t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs") + } + found = true + } + } + if !found { + t.Fatalf("Factory Root is not listed in mounts list") + } + defer unix.Unmount(root, unix.MNT_DETACH) +} + +func TestFactoryLoadNotExists(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs) + if err != nil { + t.Fatal(err) + } + _, err = factory.Load("nocontainer") + if err == nil { + t.Fatal("expected nil error loading non-existing container") + } + lerr, ok := err.(Error) + if !ok { + t.Fatal("expected libcontainer error type") + } + if lerr.Code() != ContainerNotExists { + t.Fatalf("expected error code %s but received %s", ContainerNotExists, lerr.Code()) + } +} + +func TestFactoryLoadContainer(t *testing.T) { + root, err := newTestRoot() + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + // setup default container config and state for mocking + var ( + id = "1" + expectedHooks = &configs.Hooks{ + Prestart: []configs.Hook{ + configs.CommandHook{Command: configs.Command{Path: "prestart-hook"}}, + }, + Poststart: []configs.Hook{ + configs.CommandHook{Command: configs.Command{Path: "poststart-hook"}}, + }, + Poststop: []configs.Hook{ + unserializableHook{}, + configs.CommandHook{Command: configs.Command{Path: "poststop-hook"}}, + }, + } + expectedConfig = &configs.Config{ + Rootfs: "/mycontainer/root", + Hooks: expectedHooks, + } + expectedState = &State{ + BaseState: BaseState{ + InitProcessPid: 1024, + Config: *expectedConfig, + }, + } + ) + if err := os.Mkdir(filepath.Join(root, id), 0700); err != nil { + t.Fatal(err) + } + if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil { + t.Fatal(err) + } + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + container, err := factory.Load(id) + if err != nil { + t.Fatal(err) + } + if container.ID() != id { + t.Fatalf("expected container id %q but received %q", id, container.ID()) + } + config := container.Config() + if config.Rootfs != expectedConfig.Rootfs { + t.Fatalf("expected rootfs %q but received %q", expectedConfig.Rootfs, config.Rootfs) + } + expectedHooks.Poststop = expectedHooks.Poststop[1:] // expect unserializable hook to be skipped + if !reflect.DeepEqual(config.Hooks, expectedHooks) { + t.Fatalf("expects hooks %q but received %q", expectedHooks, config.Hooks) + } + lcontainer, ok := container.(*linuxContainer) + if !ok { + t.Fatal("expected linux container on linux based systems") + } + if lcontainer.initProcess.pid() != expectedState.InitProcessPid { + t.Fatalf("expected init pid %d but received %d", expectedState.InitProcessPid, lcontainer.initProcess.pid()) + } +} + +func marshal(path string, v interface{}) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + return utils.WriteJSON(f, v) +} + +type unserializableHook struct{} + +func (unserializableHook) Run(*specs.State) error { + return nil +} diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go new file mode 100644 index 0000000..6e7de2f --- /dev/null +++ b/libcontainer/generic_error.go @@ -0,0 +1,92 @@ +package libcontainer + +import ( + "fmt" + "io" + "text/template" + "time" + + "github.com/opencontainers/runc/libcontainer/stacktrace" +) + +var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} +Code: {{.ECode}} +{{if .Message }} +Message: {{.Message}} +{{end}} +Frames:{{range $i, $frame := .Stack.Frames}} +--- +{{$i}}: {{$frame.Function}} +Package: {{$frame.Package}} +File: {{$frame.File}}@{{$frame.Line}}{{end}} +`)) + +func newGenericError(err error, c ErrorCode) Error { + if le, ok := err.(Error); ok { + return le + } + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: c, + Stack: stacktrace.Capture(1), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. +func createSystemError(err error, cause string) Error { + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: SystemError, + Cause: cause, + Stack: stacktrace.Capture(2), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +type genericError struct { + Timestamp time.Time + ECode ErrorCode + Err error `json:"-"` + Cause string + Message string + Stack stacktrace.Stacktrace +} + +func (e *genericError) Error() string { + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) +} + +func (e *genericError) Code() ErrorCode { + return e.ECode +} + +func (e *genericError) Detail(w io.Writer) error { + return errorTemplate.Execute(w, e) +} diff --git a/libcontainer/generic_error_test.go b/libcontainer/generic_error_test.go new file mode 100644 index 0000000..8fbdd4d --- /dev/null +++ b/libcontainer/generic_error_test.go @@ -0,0 +1,49 @@ +package libcontainer + +import ( + "fmt" + "io/ioutil" + "testing" +) + +func TestErrorDetail(t *testing.T) { + err := newGenericError(fmt.Errorf("test error"), SystemError) + if derr := err.Detail(ioutil.Discard); derr != nil { + t.Fatal(derr) + } +} + +func TestErrorWithCode(t *testing.T) { + err := newGenericError(fmt.Errorf("test error"), SystemError) + if code := err.Code(); code != SystemError { + t.Fatalf("expected err code %q but %q", SystemError, code) + } +} + +func TestErrorWithError(t *testing.T) { + cc := []struct { + errmsg string + cause string + }{ + { + errmsg: "test error", + }, + { + errmsg: "test error", + cause: "test", + }, + } + + for _, v := range cc { + err := newSystemErrorWithCause(fmt.Errorf(v.errmsg), v.cause) + + msg := err.Error() + if v.cause == "" && msg != v.errmsg { + t.Fatalf("expected err(%q) equal errmsg(%q)", msg, v.errmsg) + } + if v.cause != "" && msg == v.errmsg { + t.Fatalf("unexpected err(%q) equal errmsg(%q)", msg, v.errmsg) + } + + } +} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go new file mode 100644 index 0000000..c1b1560 --- /dev/null +++ b/libcontainer/init_linux.go @@ -0,0 +1,537 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "strings" + "syscall" // only for Errno + "unsafe" + + "golang.org/x/sys/unix" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" +) + +type initType string + +const ( + initSetns initType = "setns" + initStandard initType = "standard" +) + +type pid struct { + Pid int `json:"pid"` + PidFirstChild int `json:"pid_first"` +} + +// network is an internal struct used to setup container networks. +type network struct { + configs.Network + + // TempVethPeerName is a unique temporary veth peer name that was placed into + // the container's namespace. + TempVethPeerName string `json:"temp_veth_peer_name"` +} + +// initConfig is used for transferring parameters from Exec() to Init() +type initConfig struct { + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities *configs.Capabilities `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` + Config *configs.Config `json:"config"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + CreateConsole bool `json:"create_console"` + ConsoleWidth uint16 `json:"console_width"` + ConsoleHeight uint16 `json:"console_height"` + RootlessEUID bool `json:"rootless_euid,omitempty"` + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` +} + +type initer interface { + Init() error +} + +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err + } + if err := populateProcessEnvironment(config.Env); err != nil { + return nil, err + } + switch t { + case initSetns: + return &linuxSetnsInit{ + pipe: pipe, + consoleSocket: consoleSocket, + config: config, + }, nil + case initStandard: + return &linuxStandardInit{ + pipe: pipe, + consoleSocket: consoleSocket, + parentPid: unix.Getppid(), + config: config, + fifoFd: fifoFd, + }, nil + } + return nil, fmt.Errorf("unknown init type %q", t) +} + +// populateProcessEnvironment loads the provided environment variables into the +// current processes's environment. +func populateProcessEnvironment(env []string) error { + for _, pair := range env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaked file descriptors +// before executing the command inside the namespace +func finalizeNamespace(config *initConfig) error { + // Ensure that all unwanted fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { + return errors.Wrap(err, "close exec fds") + } + + if config.Cwd != "" { + if err := unix.Chdir(config.Cwd); err != nil { + return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err) + } + } + + capabilities := &configs.Capabilities{} + if config.Capabilities != nil { + capabilities = config.Capabilities + } else if config.Config.Capabilities != nil { + capabilities = config.Config.Capabilities + } + w, err := newContainerCapList(capabilities) + if err != nil { + return err + } + // drop capabilities in bounding set before changing user + if err := w.ApplyBoundingSet(); err != nil { + return errors.Wrap(err, "apply bounding set") + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return errors.Wrap(err, "set keep caps") + } + if err := setupUser(config); err != nil { + return errors.Wrap(err, "setup user") + } + if err := system.ClearKeepCaps(); err != nil { + return errors.Wrap(err, "clear keep caps") + } + if err := w.ApplyCaps(); err != nil { + return errors.Wrap(err, "apply caps") + } + return nil +} + +// setupConsole sets up the console from inside the container, and sends the +// master pty fd to the config.Pipe (using cmsg). This is done to ensure that +// consoles are scoped to a container properly (see runc#814 and the many +// issues related to that). This has to be run *after* we've pivoted to the new +// rootfs (and the users' configuration is entirely set up). +func setupConsole(socket *os.File, config *initConfig, mount bool) error { + defer socket.Close() + // At this point, /dev/ptmx points to something that we would expect. We + // used to change the owner of the slave path, but since the /dev/pts mount + // can have gid=X set (at the users' option). So touching the owner of the + // slave PTY is not necessary, as the kernel will handle that for us. Note + // however, that setupUser (specifically fixStdioPermissions) *will* change + // the UID owner of the console to be the user the process will run as (so + // they can actually control their console). + + pty, slavePath, err := console.NewPty() + if err != nil { + return err + } + + if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { + err = pty.Resize(console.WinSize{ + Height: config.ConsoleHeight, + Width: config.ConsoleWidth, + }) + + if err != nil { + return err + } + } + + // After we return from here, we don't need the console anymore. + defer pty.Close() + + // Mount the console inside our rootfs. + if mount { + if err := mountConsole(slavePath); err != nil { + return err + } + } + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { + return err + } + // Now, dup over all the things. + return dupStdio(slavePath) +} + +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + return readSync(pipe, procRun) +} + +// syncParentHooks sends to the given pipe a JSON payload which indicates that +// the parent should execute pre-start hooks. It then waits for the parent to +// indicate that it is cleared to resume. +func syncParentHooks(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procHooks); err != nil { + return err + } + + // Wait for parent to give the all-clear. + return readSync(pipe, procResume) +} + +// setupUser changes the groups, gid, and uid for the user inside the container +func setupUser(config *initConfig) error { + // Set up defaults. + defaultExecUser := user.ExecUser{ + Uid: 0, + Gid: 0, + Home: "/", + } + + passwdPath, err := user.GetPasswdPath() + if err != nil { + return err + } + + groupPath, err := user.GetGroupPath() + if err != nil { + return err + } + + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) + if err != nil { + return err + } + + var addGroups []int + if len(config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) + if err != nil { + return err + } + } + + // Rather than just erroring out later in setuid(2) and setgid(2), check + // that the user is mapped here. + if _, err := config.Config.HostUID(execUser.Uid); err != nil { + return fmt.Errorf("cannot set uid to unmapped user in user namespace") + } + if _, err := config.Config.HostGID(execUser.Gid); err != nil { + return fmt.Errorf("cannot set gid to unmapped user in user namespace") + } + + if config.RootlessEUID { + // We cannot set any additional groups in a rootless container and thus + // we bail if the user asked us to do so. TODO: We currently can't do + // this check earlier, but if libcontainer.Process.User was typesafe + // this might work. + if len(addGroups) > 0 { + return fmt.Errorf("cannot set any additional groups in a rootless container") + } + } + + // Before we change to the container's user make sure that the processes + // STDIO is correctly owned by the user that we are switching to. + if err := fixStdioPermissions(config, execUser); err != nil { + return err + } + + setgroups, err := ioutil.ReadFile("/proc/self/setgroups") + if err != nil && !os.IsNotExist(err) { + return err + } + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny" + + if allowSupGroups { + suppGroups := append(execUser.Sgids, addGroups...) + if err := unix.Setgroups(suppGroups); err != nil { + return err + } + } + + if err := system.Setgid(execUser.Gid); err != nil { + return err + } + if err := system.Setuid(execUser.Uid); err != nil { + return err + } + + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", execUser.Home); err != nil { + return err + } + } + return nil +} + +// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. +// The ownership needs to match because it is created outside of the container and needs to be +// localized. +func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { + var null unix.Stat_t + if err := unix.Stat("/dev/null", &null); err != nil { + return err + } + for _, fd := range []uintptr{ + os.Stdin.Fd(), + os.Stderr.Fd(), + os.Stdout.Fd(), + } { + var s unix.Stat_t + if err := unix.Fstat(int(fd), &s); err != nil { + return err + } + + // Skip chown of /dev/null if it was used as one of the STDIO fds. + if s.Rdev == null.Rdev { + continue + } + + // We only change the uid owner (as it is possible for the mount to + // prefer a different gid, and there's no reason for us to change it). + // The reason why we don't just leave the default uid=X mount setup is + // that users expect to be able to actually use their console. Without + // this code, you couldn't effectively run as a non-root user inside a + // container and also have a console set up. + if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { + // If we've hit an EINVAL then s.Gid isn't mapped in the user + // namespace. If we've hit an EPERM then the inode's current owner + // is not mapped in our user namespace (in particular, + // privileged_wrt_inode_uidgid() has failed). In either case, we + // are in a configuration where it's better for us to just not + // touch the stdio rather than bail at this point. + if err == unix.EINVAL || err == unix.EPERM { + continue + } + return err + } + } + return nil +} + +// setupNetwork sets up and initializes any network interface inside the container. +func setupNetwork(config *initConfig) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.initialize(config); err != nil { + return err + } + } + return nil +} + +func setupRoute(config *configs.Config) error { + for _, config := range config.Routes { + _, dst, err := net.ParseCIDR(config.Destination) + if err != nil { + return err + } + src := net.ParseIP(config.Source) + if src == nil { + return fmt.Errorf("Invalid source for route: %s", config.Source) + } + gw := net.ParseIP(config.Gateway) + if gw == nil { + return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) + } + l, err := netlink.LinkByName(config.InterfaceName) + if err != nil { + return err + } + route := &netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + Dst: dst, + Src: src, + Gw: gw, + LinkIndex: l.Attrs().Index, + } + if err := netlink.RouteAdd(route); err != nil { + return err + } + } + return nil +} + +func setupRlimits(limits []configs.Rlimit, pid int) error { + for _, rlimit := range limits { + if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { + return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + } + } + return nil +} + +const _P_PID = 1 + +type siginfo struct { + si_signo int32 + si_errno int32 + si_code int32 + // below here is a union; si_pid is the only field we use + si_pid int32 + // Pad to 128 bytes as detailed in blockUntilWaitable + pad [96]byte +} + +// isWaitable returns true if the process has exited false otherwise. +// Its based off blockUntilWaitable in src/os/wait_waitid.go +func isWaitable(pid int) (bool, error) { + si := &siginfo{} + _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) + if e != 0 { + return false, os.NewSyscallError("waitid", e) + } + + return si.si_pid != 0, nil +} + +// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise +func isNoChildren(err error) bool { + switch err := err.(type) { + case syscall.Errno: + if err == unix.ECHILD { + return true + } + case *os.SyscallError: + if err.Err == unix.ECHILD { + return true + } + } + return false +} + +// signalAllProcesses freezes then iterates over all the processes inside the +// manager's cgroups sending the signal s to them. +// If s is SIGKILL then it will wait for each process to exit. +// For all other signals it will check if the process is ready to report its +// exit status and only if it is will a wait be performed. +func signalAllProcesses(m cgroups.Manager, s os.Signal) error { + var procs []*os.Process + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Warn(err) + } + pids, err := m.GetAllPids() + if err != nil { + m.Freeze(configs.Thawed) + return err + } + for _, pid := range pids { + p, err := os.FindProcess(pid) + if err != nil { + logrus.Warn(err) + continue + } + procs = append(procs, p) + if err := p.Signal(s); err != nil { + logrus.Warn(err) + } + } + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } + + subreaper, err := system.GetSubreaper() + if err != nil { + // The error here means that PR_GET_CHILD_SUBREAPER is not + // supported because this code might run on a kernel older + // than 3.4. We don't want to throw an error in that case, + // and we simplify things, considering there is no subreaper + // set. + subreaper = 0 + } + + for _, p := range procs { + if s != unix.SIGKILL { + if ok, err := isWaitable(p.Pid); err != nil { + if !isNoChildren(err) { + logrus.Warn("signalAllProcesses: ", p.Pid, err) + } + continue + } else if !ok { + // Not ready to report so don't wait + continue + } + } + + // In case a subreaper has been setup, this code must not + // wait for the process. Otherwise, we cannot be sure the + // current process will be reaped by the subreaper, while + // the subreaper might be waiting for this process in order + // to retrieve its exit code. + if subreaper == 0 { + if _, err := p.Wait(); err != nil { + if !isNoChildren(err) { + logrus.Warn("wait: ", err) + } + } + } + } + return nil +} diff --git a/libcontainer/integration/checkpoint_test.go b/libcontainer/integration/checkpoint_test.go new file mode 100644 index 0000000..cdb6810 --- /dev/null +++ b/libcontainer/integration/checkpoint_test.go @@ -0,0 +1,264 @@ +package integration + +import ( + "bufio" + "bytes" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" +) + +func showFile(t *testing.T, fname string) error { + t.Logf("=== %s ===\n", fname) + + f, err := os.Open(fname) + if err != nil { + t.Log(err) + return err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + t.Log(scanner.Text()) + } + + if err := scanner.Err(); err != nil { + return err + } + + t.Logf("=== END ===\n") + + return nil +} + +func TestUsernsCheckpoint(t *testing.T) { + t.Skip("Ubuntu kernel is broken to run criu (#2196, #2198)") + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + cmd := exec.Command("criu", "check", "--feature", "userns") + if err := cmd.Run(); err != nil { + t.Skip("Unable to c/r a container with userns") + } + testCheckpoint(t, true) +} + +func TestCheckpoint(t *testing.T) { + t.Skip("Ubuntu kernel is broken to run criu (#2196, #2198)") + testCheckpoint(t, false) +} + +func testCheckpoint(t *testing.T, userns bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + root, err := newTestRoot() + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Destination: "/sys/fs/cgroup", + Device: "cgroup", + Flags: defaultMountFlags | unix.MS_RDONLY, + }) + + if userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } + + factory, err := libcontainer.New(root, libcontainer.Cgroupfs) + + if err != nil { + t.Fatal(err) + } + + container, err := factory.Create("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Stdout: &stdout, + Init: true, + } + + err = container.Run(&pconfig) + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + + pid, err := pconfig.Pid() + if err != nil { + t.Fatal(err) + } + + process, err := os.FindProcess(pid) + if err != nil { + t.Fatal(err) + } + + parentDir, err := ioutil.TempDir("", "criu-parent") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(parentDir) + + preDumpOpts := &libcontainer.CriuOpts{ + ImagesDirectory: parentDir, + WorkDirectory: parentDir, + PreDump: true, + } + preDumpLog := filepath.Join(preDumpOpts.WorkDirectory, "dump.log") + + if err := container.Checkpoint(preDumpOpts); err != nil { + showFile(t, preDumpLog) + t.Fatal(err) + } + + state, err := container.Status() + if err != nil { + t.Fatal(err) + } + + if state != libcontainer.Running { + t.Fatal("Unexpected preDump state: ", state) + } + + imagesDir, err := ioutil.TempDir("", "criu") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(imagesDir) + + checkpointOpts := &libcontainer.CriuOpts{ + ImagesDirectory: imagesDir, + WorkDirectory: imagesDir, + ParentImage: "../criu-parent", + } + dumpLog := filepath.Join(checkpointOpts.WorkDirectory, "dump.log") + restoreLog := filepath.Join(checkpointOpts.WorkDirectory, "restore.log") + + if err := container.Checkpoint(checkpointOpts); err != nil { + showFile(t, dumpLog) + t.Fatal(err) + } + + state, err = container.Status() + if err != nil { + t.Fatal(err) + } + + if state != libcontainer.Stopped { + t.Fatal("Unexpected state checkpoint: ", state) + } + + stdinW.Close() + _, err = process.Wait() + if err != nil { + t.Fatal(err) + } + + // reload the container + container, err = factory.Load("test") + if err != nil { + t.Fatal(err) + } + + restoreStdinR, restoreStdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + restoreProcessConfig := &libcontainer.Process{ + Cwd: "/", + Stdin: restoreStdinR, + Stdout: &stdout, + Init: true, + } + + err = container.Restore(restoreProcessConfig, checkpointOpts) + restoreStdinR.Close() + defer restoreStdinW.Close() + if err != nil { + showFile(t, restoreLog) + t.Fatal(err) + } + + state, err = container.Status() + if err != nil { + t.Fatal(err) + } + if state != libcontainer.Running { + t.Fatal("Unexpected restore state: ", state) + } + + pid, err = restoreProcessConfig.Pid() + if err != nil { + t.Fatal(err) + } + + process, err = os.FindProcess(pid) + if err != nil { + t.Fatal(err) + } + + _, err = restoreStdinW.WriteString("Hello!") + if err != nil { + t.Fatal(err) + } + + restoreStdinW.Close() + s, err := process.Wait() + if err != nil { + t.Fatal(err) + } + + if !s.Success() { + t.Fatal(s.String(), pid) + } + + output := string(stdout.Bytes()) + if !strings.Contains(output, "Hello!") { + t.Fatal("Did not restore the pipe correctly:", output) + } +} diff --git a/libcontainer/integration/doc.go b/libcontainer/integration/doc.go new file mode 100644 index 0000000..87545bc --- /dev/null +++ b/libcontainer/integration/doc.go @@ -0,0 +1,2 @@ +// integration is used for integration testing of libcontainer +package integration diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go new file mode 100644 index 0000000..7822fa8 --- /dev/null +++ b/libcontainer/integration/exec_test.go @@ -0,0 +1,1793 @@ +package integration + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "reflect" + "strconv" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +func TestExecPS(t *testing.T) { + testExecPS(t, false) +} + +func TestUsernsExecPS(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + testExecPS(t, true) +} + +func testExecPS(t *testing.T, userns bool) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + if userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } + + buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + lines := strings.Split(buffers.Stdout.String(), "\n") + if len(lines) < 2 { + t.Fatalf("more than one process running for output %q", buffers.Stdout.String()) + } + expected := `1 root ps` + actual := strings.Trim(lines[1], "\n ") + if actual != expected { + t.Fatalf("expected output %q but received %q", expected, actual) + } +} + +func TestIPCPrivate(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(rootfs) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { + t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l) + } +} + +func TestIPCHost(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(rootfs) + config.Namespaces.Remove(configs.NEWIPC) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestIPCJoinPath(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(rootfs) + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc") + + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestIPCBadPath(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipcc") + + _, _, err = runContainer(config, "", "true") + if err == nil { + t.Fatal("container succeeded with bad ipc path") + } +} + +func TestRlimit(t *testing.T) { + testRlimit(t, false) +} + +func TestUsernsRlimit(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + testRlimit(t, true) +} + +func testRlimit(t *testing.T, userns bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } + + // ensure limit is lower than what the config requests to test that in a user namespace + // the Setrlimit call happens early enough that we still have permissions to raise the limit. + ok(t, unix.Setrlimit(unix.RLIMIT_NOFILE, &unix.Rlimit{ + Max: 1024, + Cur: 1024, + })) + + out, _, err := runContainer(config, "", "/bin/sh", "-c", "ulimit -n") + ok(t, err) + if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" { + t.Fatalf("expected rlimit to be 1025, got %s", limit) + } +} + +func TestEnter(t *testing.T) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + var stdout, stdout2 bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"}, + Env: standardEnvironment, + Stdin: stdinR, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + pid, err := pconfig.Pid() + ok(t, err) + + // Execute another process in the container + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + pconfig2 := libcontainer.Process{ + Cwd: "/", + Env: standardEnvironment, + } + pconfig2.Args = []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"} + pconfig2.Stdin = stdinR2 + pconfig2.Stdout = &stdout2 + + err = container.Run(&pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + pid2, err := pconfig2.Pid() + ok(t, err) + + processes, err := container.Processes() + ok(t, err) + + n := 0 + for i := range processes { + if processes[i] == pid || processes[i] == pid2 { + n++ + } + } + if n != 2 { + t.Fatal("unexpected number of processes", processes, pid, pid2) + } + + // Wait processes + stdinW2.Close() + waitProcess(&pconfig2, t) + + stdinW.Close() + waitProcess(&pconfig, t) + + // Check that both processes live in the same pidns + pidns := string(stdout.Bytes()) + ok(t, err) + + pidns2 := string(stdout2.Bytes()) + ok(t, err) + + if pidns != pidns2 { + t.Fatal("The second process isn't in the required pid namespace", pidns, pidns2) + } +} + +func TestProcessEnv(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "env"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=integration", + "TERM=xterm", + "FOO=BAR", + }, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputEnv := string(stdout.Bytes()) + + // Check that the environment has the key/value pair we added + if !strings.Contains(outputEnv, "FOO=BAR") { + t.Fatal("Environment doesn't have the expected FOO=BAR key/value pair: ", outputEnv) + } + + // Make sure that HOME is set + if !strings.Contains(outputEnv, "HOME=/root") { + t.Fatal("Environment doesn't have HOME set: ", outputEnv) + } +} + +func TestProcessEmptyCaps(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Capabilities = nil + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/status"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputStatus := string(stdout.Bytes()) + + lines := strings.Split(outputStatus, "\n") + + effectiveCapsLine := "" + for _, l := range lines { + line := strings.TrimSpace(l) + if strings.Contains(line, "CapEff:") { + effectiveCapsLine = line + break + } + } + + if effectiveCapsLine == "" { + t.Fatal("Couldn't find effective caps: ", outputStatus) + } +} + +func TestProcessCaps(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/status"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Capabilities: &configs.Capabilities{}, + Init: true, + } + pconfig.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_NET_ADMIN") + pconfig.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_NET_ADMIN") + pconfig.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_NET_ADMIN") + pconfig.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_NET_ADMIN") + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputStatus := string(stdout.Bytes()) + + lines := strings.Split(outputStatus, "\n") + + effectiveCapsLine := "" + for _, l := range lines { + line := strings.TrimSpace(l) + if strings.Contains(line, "CapEff:") { + effectiveCapsLine = line + break + } + } + + if effectiveCapsLine == "" { + t.Fatal("Couldn't find effective caps: ", outputStatus) + } + + parts := strings.Split(effectiveCapsLine, ":") + effectiveCapsStr := strings.TrimSpace(parts[1]) + + effectiveCaps, err := strconv.ParseUint(effectiveCapsStr, 16, 64) + if err != nil { + t.Fatal("Could not parse effective caps", err) + } + + var netAdminMask uint64 + var netAdminBit uint + netAdminBit = 12 // from capability.h + netAdminMask = 1 << netAdminBit + if effectiveCaps&netAdminMask != netAdminMask { + t.Fatal("CAP_NET_ADMIN is not set as expected") + } +} + +func TestAdditionalGroups(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "id", "-Gn"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + AdditionalGroups: []string{"plugdev", "audio"}, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputGroups := string(stdout.Bytes()) + + // Check that the groups output has the groups that we specified + if !strings.Contains(outputGroups, "audio") { + t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups) + } + + if !strings.Contains(outputGroups, "plugdev") { + t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups) + } +} + +func TestFreeze(t *testing.T) { + testFreeze(t, false) +} + +func TestSystemdFreeze(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testFreeze(t, true) +} + +func testFreeze(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + err = container.Pause() + ok(t, err) + state, err := container.Status() + ok(t, err) + err = container.Resume() + ok(t, err) + if state != libcontainer.Paused { + t.Fatal("Unexpected state: ", state) + } + + stdinW.Close() + waitProcess(pconfig, t) +} + +func TestCpuShares(t *testing.T) { + testCpuShares(t, false) +} + +func TestCpuSharesSystemd(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testCpuShares(t, true) +} + +func testCpuShares(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if systemd { + config.Cgroups.Parent = "system.slice" + } + config.Cgroups.Resources.CpuShares = 1 + + _, _, err = runContainer(config, "", "ps") + if err == nil { + t.Fatalf("runContainer should failed with invalid CpuShares") + } +} + +func TestPids(t *testing.T) { + testPids(t, false) +} + +func TestPidsSystemd(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testPids(t, true) +} + +func testPids(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if systemd { + config.Cgroups.Parent = "system.slice" + } + config.Cgroups.Resources.PidsLimit = -1 + + // Running multiple processes. + _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with no pids limit") + } + + // Enforce a permissive limit. This needs to be fairly hand-wavey due to the + // issues with running Go binaries with pids restrictions (see below). + config.Cgroups.Resources.PidsLimit = 64 + _, ret, err = runContainer(config, "", "/bin/sh", "-c", ` + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with permissive pids limit") + } + + // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this + // to fail reliability. + config.Cgroups.Resources.PidsLimit = 64 + out, _, err := runContainer(config, "", "/bin/sh", "-c", ` + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + if err != nil && !strings.Contains(out.String(), "sh: can't fork") { + ok(t, err) + } + + if err == nil { + t.Fatalf("expected fork() to fail with restrictive pids limit") + } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. +} + +func TestRunWithKernelMemory(t *testing.T) { + testRunWithKernelMemory(t, false) +} + +func TestRunWithKernelMemorySystemd(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testRunWithKernelMemory(t, true) +} + +func testRunWithKernelMemory(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v1 is not supported") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if systemd { + config.Cgroups.Parent = "system.slice" + } + config.Cgroups.Resources.KernelMemory = 52428800 + + _, _, err = runContainer(config, "", "ps") + if err != nil { + t.Fatalf("runContainer failed with kernel memory limit: %v", err) + } +} + +func TestContainerState(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + if err != nil { + t.Fatal(err) + } + + config := newTemplateConfig(rootfs) + config.Namespaces = configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + // host for IPC + //{Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWNET}, + }) + + container, err := newContainerWithName("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + p := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(p) + if err != nil { + t.Fatal(err) + } + stdinR.Close() + defer stdinW.Close() + + st, err := container.State() + if err != nil { + t.Fatal(err) + } + + l1, err := os.Readlink(st.NamespacePaths[configs.NEWIPC]) + if err != nil { + t.Fatal(err) + } + if l1 != l { + t.Fatal("Container using non-host ipc namespace") + } + stdinW.Close() + waitProcess(p, t) +} + +func TestPassExtraFiles(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + container, err := newContainerWithName("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + var stdout bytes.Buffer + pipeout1, pipein1, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + pipeout2, pipein2, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + process := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, + Env: []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}, + ExtraFiles: []*os.File{pipein1, pipein2}, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&process) + if err != nil { + t.Fatal(err) + } + + waitProcess(&process, t) + + out := string(stdout.Bytes()) + // fd 5 is the directory handle for /proc/$$/fd + if out != "0 1 2 3 4 5" { + t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to init, got '%s'", out) + } + var buf = []byte{0} + _, err = pipeout1.Read(buf) + if err != nil { + t.Fatal(err) + } + out1 := string(buf) + if out1 != "1" { + t.Fatalf("expected first pipe to receive '1', got '%s'", out1) + } + + _, err = pipeout2.Read(buf) + if err != nil { + t.Fatal(err) + } + out2 := string(buf) + if out2 != "2" { + t.Fatalf("expected second pipe to receive '2', got '%s'", out2) + } +} + +func TestMountCmds(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + tmpDir, err := ioutil.TempDir("", "tmpdir") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + + config := newTemplateConfig(rootfs) + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: tmpDir, + Destination: "/tmp", + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC, + PremountCmds: []configs.Command{ + {Path: "touch", Args: []string{filepath.Join(tmpDir, "hello")}}, + {Path: "touch", Args: []string{filepath.Join(tmpDir, "world")}}, + }, + PostmountCmds: []configs.Command{ + {Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "hello"), filepath.Join(rootfs, "tmp", "hello-backup")}}, + {Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "world"), filepath.Join(rootfs, "tmp", "world-backup")}}, + }, + }) + + container, err := newContainerWithName("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "env"}, + Env: standardEnvironment, + Init: true, + } + err = container.Run(&pconfig) + if err != nil { + t.Fatal(err) + } + + // Wait for process + waitProcess(&pconfig, t) + + entries, err := ioutil.ReadDir(tmpDir) + if err != nil { + t.Fatal(err) + } + expected := []string{"hello", "hello-backup", "world", "world-backup"} + for i, e := range entries { + if e.Name() != expected[i] { + t.Errorf("Got(%s), expect %s", e.Name(), expected[i]) + } + } +} + +func TestSysctl(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Sysctl = map[string]string{ + "kernel.shmmni": "8192", + } + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/sys/kernel/shmmni"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + shmmniOutput := strings.TrimSpace(string(stdout.Bytes())) + if shmmniOutput != "8192" { + t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", shmmniOutput) + } +} + +func TestMountCgroupRO(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Destination: "/sys/fs/cgroup", + Device: "cgroup", + Flags: defaultMountFlags | unix.MS_RDONLY, + }) + + buffers, exitCode, err := runContainer(config, "", "mount") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + mountInfo := buffers.Stdout.String() + lines := strings.Split(mountInfo, "\n") + for _, l := range lines { + if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { + if !strings.Contains(l, "ro") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l) + } + if !strings.Contains(l, "mode=755") { + t.Fatalf("Mode expected to contain 'mode=755': %s", l) + } + continue + } + if !strings.HasPrefix(l, "cgroup") { + continue + } + if !strings.Contains(l, "ro") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l) + } + } +} + +func TestMountCgroupRW(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Destination: "/sys/fs/cgroup", + Device: "cgroup", + Flags: defaultMountFlags, + }) + + buffers, exitCode, err := runContainer(config, "", "mount") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + mountInfo := buffers.Stdout.String() + lines := strings.Split(mountInfo, "\n") + for _, l := range lines { + if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { + if !strings.Contains(l, "rw") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l) + } + if !strings.Contains(l, "mode=755") { + t.Fatalf("Mode expected to contain 'mode=755': %s", l) + } + continue + } + if !strings.HasPrefix(l, "cgroup") { + continue + } + if !strings.Contains(l, "rw") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l) + } + } +} + +func TestOomScoreAdj(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.OomScoreAdj = ptrInt(200) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/oom_score_adj"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + outputOomScoreAdj := strings.TrimSpace(string(stdout.Bytes())) + + // Check that the oom_score_adj matches the value that was set as part of config. + if outputOomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("Expected oom_score_adj %d; got %q", *config.OomScoreAdj, outputOomScoreAdj) + } +} + +func TestHook(t *testing.T) { + if testing.Short() { + return + } + + bundle, err := newTestBundle() + ok(t, err) + defer remove(bundle) + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + expectedBundle := bundle + config.Labels = append(config.Labels, fmt.Sprintf("bundle=%s", expectedBundle)) + + getRootfsFromBundle := func(bundle string) (string, error) { + f, err := os.Open(filepath.Join(bundle, "config.json")) + if err != nil { + return "", err + } + + var config configs.Config + if err = json.NewDecoder(f).Decode(&config); err != nil { + return "", err + } + return config.Rootfs, nil + } + + config.Hooks = &configs.Hooks{ + Prestart: []configs.Hook{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + + root, err := getRootfsFromBundle(s.Bundle) + if err != nil { + return err + } + f, err := os.Create(filepath.Join(root, "test")) + if err != nil { + return err + } + return f.Close() + }), + }, + Poststart: []configs.Hook{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + + root, err := getRootfsFromBundle(s.Bundle) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(root, "test"), []byte("hello world"), 0755) + }), + }, + Poststop: []configs.Hook{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + + root, err := getRootfsFromBundle(s.Bundle) + if err != nil { + return err + } + return os.RemoveAll(filepath.Join(root, "test")) + }), + }, + } + + // write config of json format into config.json under bundle + f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644) + ok(t, err) + ok(t, json.NewEncoder(f).Encode(config)) + + container, err := newContainerWithName("test", config) + ok(t, err) + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "ls /test"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputLs := string(stdout.Bytes()) + + // Check that the ls output has the expected file touched by the prestart hook + if !strings.Contains(outputLs, "/test") { + container.Destroy() + t.Fatalf("ls output doesn't have the expected file: %s", outputLs) + } + + // Check that the file is written by the poststart hook + testFilePath := filepath.Join(rootfs, "test") + contents, err := ioutil.ReadFile(testFilePath) + if err != nil { + t.Fatalf("cannot read file '%s': %s", testFilePath, err) + } + if string(contents) != "hello world" { + t.Fatalf("Expected test file to contain 'hello world'; got '%s'", string(contents)) + } + + if err := container.Destroy(); err != nil { + t.Fatalf("container destroy %s", err) + } + fi, err := os.Stat(filepath.Join(rootfs, "test")) + if err == nil || !os.IsNotExist(err) { + t.Fatalf("expected file to not exist, got %s", fi.Name()) + } +} + +func TestSTDIOPermissions(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + buffers, exitCode, err := runContainer(config, "", "sh", "-c", "echo hi > /dev/stderr") + ok(t, err) + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" { + t.Fatalf("stderr should equal be equal %q %q", actual, "hi") + } +} + +func unmountOp(path string) error { + return unix.Unmount(path, unix.MNT_DETACH) +} + +// Launch container with rootfsPropagation in rslave mode. Also +// bind mount a volume /mnt1host at /mnt1cont at the time of launch. Now do +// another mount on host (/mnt1host/mnt2host) and this new mount should +// propagate to container (/mnt1cont/mnt2host) +func TestRootfsPropagationSlaveMount(t *testing.T) { + var mountPropagated bool + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + + config.RootPropagation = unix.MS_SLAVE | unix.MS_REC + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // slave relationship can be established in container. + err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + ok(t, err) + err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC}) + + container, err := newContainerWithName("testSlaveMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2host and bind mount itself on top of it. This + // should be visible in container. + dir2host, err := ioutil.TempDir(dir1host, "mnt2host") + ok(t, err) + defer os.RemoveAll(dir2host) + + err = unix.Mount(dir2host, dir2host, "bind", unix.MS_BIND, "") + defer unmountOp(dir2host) + ok(t, err) + + // Run "cat /proc/self/mountinfo" in container and look at mount points. + var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + pconfig2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat", "/proc/self/mountinfo"}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + } + + err = container.Run(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + mountPropagated = false + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + propagationInfo := string(stdout2.Bytes()) + lines := strings.Split(propagationInfo, "\n") + for _, l := range lines { + linefields := strings.Split(l, " ") + if len(linefields) < 5 { + continue + } + + if linefields[4] == dir2cont { + mountPropagated = true + break + } + } + + if mountPropagated != true { + t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont) + } +} + +// Launch container with rootfsPropagation 0 so no propagation flags are +// applied. Also bind mount a volume /mnt1host at /mnt1cont at the time of +// launch. Now do a mount in container (/mnt1cont/mnt2cont) and this new +// mount should propagate to host (/mnt1host/mnt2cont) + +func TestRootfsPropagationSharedMount(t *testing.T) { + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.RootPropagation = unix.MS_PRIVATE + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // shared relationship can be established in container. + err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + ok(t, err) + err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC}) + + container, err := newContainerWithName("testSharedMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2cont. This will become visible inside container + // at mnt1cont/mnt2cont. Bind mount itself on top of it. This + // should be visible on host now. + dir2host, err := ioutil.TempDir(dir1host, "mnt2cont") + ok(t, err) + defer os.RemoveAll(dir2host) + + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + // Mount something in container and see if it is visible on host. + var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + pconfig2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"mount", "--bind", dir2cont, dir2cont}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + Capabilities: &configs.Capabilities{}, + } + + // Provide CAP_SYS_ADMIN + pconfig2.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_SYS_ADMIN") + + err = container.Run(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // Wait for process + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + defer unmountOp(dir2host) + + // Check if mount is visible on host or not. + out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput() + outtrim := strings.TrimSpace(string(out)) + if err != nil { + t.Logf("findmnt error %q: %q", err, outtrim) + } + + if string(outtrim) != dir2host { + t.Fatalf("Mount in container on %s did not propagate to host on %s. finmnt output=%s", dir2cont, dir2host, outtrim) + } +} + +func TestPIDHost(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/pid") + ok(t, err) + + config := newTemplateConfig(rootfs) + config.Namespaces.Remove(configs.NEWPID) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/pid") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestInitJoinPID(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + container1, err := newContainer(newTemplateConfig(rootfs)) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + Init: true, + } + err = container1.Run(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + pidns1 := state1.NamespacePaths[configs.NEWPID] + + // Run a container inside the existing pidns but with different cgroups + config2 := newTemplateConfig(rootfs) + config2.Namespaces.Add(configs.NEWPID, pidns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + Init: true, + } + err = container2.Run(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("pidns(%s), wanted %s", ns2, ns1) + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // check that pidns is joined correctly. The initial container process list + // should contain the second container's init process + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdout: buffers.Stdout, + } + err = container1.Run(ps) + ok(t, err) + waitProcess(ps, t) + + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) + + out := strings.TrimSpace(buffers.Stdout.String()) + // output of ps inside the initial PID namespace should have + // 1 line of header, + // 2 lines of init processes, + // 1 line of ps process + if len(strings.Split(out, "\n")) != 4 { + t.Errorf("unexpected running process, output %q", out) + } +} + +func TestInitJoinNetworkAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + config1 := newTemplateConfig(rootfs) + config1.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config1.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container1, err := newContainer(config1) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + Init: true, + } + err = container1.Run(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + netns1 := state1.NamespacePaths[configs.NEWNET] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Run a container inside the existing pidns but with different cgroups + rootfs2, err := newRootfs() + ok(t, err) + defer remove(rootfs2) + + config2 := newTemplateConfig(rootfs2) + config2.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config2.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config2.Namespaces.Add(configs.NEWNET, netns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + Init: true, + } + err = container2.Run(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + for _, ns := range []string{"net", "user"} { + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("%s(%s), wanted %s", ns, ns2, ns1) + } + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) +} + +func TestTmpfsCopyUp(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: "tmpfs", + Destination: "/etc", + Device: "tmpfs", + Extensions: configs.EXT_COPYUP, + }) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Args: []string{"ls", "/etc/passwd"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputLs := string(stdout.Bytes()) + + // Check that the ls output has /etc/passwd + if !strings.Contains(outputLs, "/etc/passwd") { + t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs) + } +} + +func TestCGROUPPrivate(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + t.Skip("cgroupns is unsupported") + } + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/cgroup") + ok(t, err) + + config := newTemplateConfig(rootfs) + config.Namespaces.Add(configs.NEWCGROUP, "") + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { + t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l) + } +} + +func TestCGROUPHost(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + t.Skip("cgroupns is unsupported") + } + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/cgroup") + ok(t, err) + + config := newTemplateConfig(rootfs) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("cgroup link not equal to host link %q %q", actual, l) + } +} diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go new file mode 100644 index 0000000..14f8a59 --- /dev/null +++ b/libcontainer/integration/execin_test.go @@ -0,0 +1,608 @@ +package integration + +import ( + "bytes" + "fmt" + "io" + "os" + "strconv" + "strings" + "testing" + "time" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +func TestExecIn(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + } + + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { + t.Fatalf("unexpected running process, output %q", out) + } + if strings.Contains(out, "\r") { + t.Fatalf("unexpected carriage-return in output") + } +} + +func TestExecInUsernsRlimit(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + testExecInRlimit(t, true) +} + +func TestExecInRlimit(t *testing.T) { + testExecInRlimit(t, false) +} + +func testExecInRlimit(t *testing.T, userns bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } + + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"/bin/sh", "-c", "ulimit -n"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Rlimits: []configs.Rlimit{ + // increase process rlimit higher than container rlimit to test per-process limit + {Type: unix.RLIMIT_NOFILE, Hard: 1026, Soft: 1026}, + }, + Init: true, + } + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if limit := strings.TrimSpace(out); limit != "1026" { + t.Fatalf("expected rlimit to be 1026, got %s", limit) + } +} + +func TestExecInAdditionalGroups(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "id", "-Gn"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + AdditionalGroups: []string{"plugdev", "audio"}, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + stdinW.Close() + waitProcess(process, t) + + outputGroups := string(stdout.Bytes()) + + // Check that the groups output has the groups that we specified + if !strings.Contains(outputGroups, "audio") { + t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups) + } + + if !strings.Contains(outputGroups, "plugdev") { + t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups) + } +} + +func TestExecInError(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer func() { + stdinW.Close() + if _, err := process.Wait(); err != nil { + t.Log(err) + } + }() + ok(t, err) + + for i := 0; i < 42; i++ { + var out bytes.Buffer + unexistent := &libcontainer.Process{ + Cwd: "/", + Args: []string{"unexistent"}, + Env: standardEnvironment, + Stderr: &out, + } + err = container.Run(unexistent) + if err == nil { + t.Fatal("Should be an error") + } + if !strings.Contains(err.Error(), "executable file not found") { + t.Fatalf("Should be error about not found executable, got %s", err) + } + if !bytes.Contains(out.Bytes(), []byte("executable file not found")) { + t.Fatalf("executable file not found error not delivered to stdio:\n%s", out.String()) + } + } +} + +func TestExecInTTY(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + var stdout bytes.Buffer + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + } + parent, child, err := utils.NewSockPair("console") + if err != nil { + ok(t, err) + } + defer parent.Close() + defer child.Close() + ps.ConsoleSocket = child + type cdata struct { + c console.Console + err error + } + dc := make(chan *cdata, 1) + go func() { + f, err := utils.RecvFd(parent) + if err != nil { + dc <- &cdata{ + err: err, + } + return + } + c, err := console.ConsoleFromFile(f) + if err != nil { + dc <- &cdata{ + err: err, + } + return + } + console.ClearONLCR(c.Fd()) + dc <- &cdata{ + c: c, + } + }() + err = container.Run(ps) + ok(t, err) + data := <-dc + if data.err != nil { + ok(t, data.err) + } + console := data.c + copy := make(chan struct{}) + go func() { + io.Copy(&stdout, console) + close(copy) + }() + ok(t, err) + select { + case <-time.After(5 * time.Second): + t.Fatal("Waiting for copy timed out") + case <-copy: + } + waitProcess(ps, t) + + stdinW.Close() + waitProcess(process, t) + + out := stdout.String() + if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { + t.Fatalf("unexpected running process, output %q", out) + } + if strings.Contains(out, "\r") { + t.Fatalf("unexpected carriage-return in output") + } +} + +func TestExecInEnvironment(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"env"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "DEBUG=true", + "DEBUG=false", + "ENV=test", + }, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + err = container.Run(process2) + ok(t, err) + waitProcess(process2, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + // check execin's process environment + if !strings.Contains(out, "DEBUG=false") || + !strings.Contains(out, "ENV=test") || + !strings.Contains(out, "HOME=/root") || + !strings.Contains(out, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") || + strings.Contains(out, "DEBUG=true") { + t.Fatalf("unexpected running process, output %q", out) + } +} + +func TestExecinPassExtraFiles(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + config := newTemplateConfig(rootfs) + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + pipeout1, pipein1, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + pipeout2, pipein2, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + inprocess := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, + Env: []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}, + ExtraFiles: []*os.File{pipein1, pipein2}, + Stdin: nil, + Stdout: &stdout, + } + err = container.Run(inprocess) + if err != nil { + t.Fatal(err) + } + + waitProcess(inprocess, t) + stdinW.Close() + waitProcess(process, t) + + out := string(stdout.Bytes()) + // fd 5 is the directory handle for /proc/$$/fd + if out != "0 1 2 3 4 5" { + t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out) + } + var buf = []byte{0} + _, err = pipeout1.Read(buf) + if err != nil { + t.Fatal(err) + } + out1 := string(buf) + if out1 != "1" { + t.Fatalf("expected first pipe to receive '1', got '%s'", out1) + } + + _, err = pipeout2.Read(buf) + if err != nil { + t.Fatal(err) + } + out2 := string(buf) + if out2 != "2" { + t.Fatalf("expected second pipe to receive '2', got '%s'", out2) + } +} + +func TestExecInOomScoreAdj(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.OomScoreAdj = ptrInt(200) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"/bin/sh", "-c", "cat /proc/self/oom_score_adj"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + } + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj) + } +} + +func TestExecInUserns(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + initPID, err := process.Pid() + ok(t, err) + initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID)) + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"readlink", "/proc/self/ns/user"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + }, + Stdout: buffers.Stdout, + Stderr: os.Stderr, + } + err = container.Run(process2) + ok(t, err) + waitProcess(process2, t) + stdinW.Close() + waitProcess(process, t) + + if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { + t.Errorf("execin userns(%s), wanted %s", out, initUserns) + } +} diff --git a/libcontainer/integration/init_test.go b/libcontainer/integration/init_test.go new file mode 100644 index 0000000..f5180ea --- /dev/null +++ b/libcontainer/integration/init_test.go @@ -0,0 +1,46 @@ +package integration + +import ( + "os" + "runtime" + "testing" + + "github.com/opencontainers/runc/libcontainer" + _ "github.com/opencontainers/runc/libcontainer/nsenter" + + "github.com/sirupsen/logrus" +) + +// init runs the libcontainer initialization code because of the busybox style needs +// to work around the go runtime and the issues with forking +func init() { + if len(os.Args) < 2 || os.Args[1] != "init" { + return + } + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, err := libcontainer.New("") + if err != nil { + logrus.Fatalf("unable to initialize for container: %s", err) + } + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } +} + +var testRoots []string + +func TestMain(m *testing.M) { + logrus.SetOutput(os.Stderr) + logrus.SetLevel(logrus.InfoLevel) + + // Clean up roots after running everything. + defer func() { + for _, root := range testRoots { + os.RemoveAll(root) + } + }() + + ret := m.Run() + os.Exit(ret) +} diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go new file mode 100644 index 0000000..77f1a8d --- /dev/null +++ b/libcontainer/integration/seccomp_test.go @@ -0,0 +1,422 @@ +// +build linux,cgo,seccomp + +package integration + +import ( + "strings" + "syscall" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" +) + +func TestSeccompDenyGetcwd(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "getcwd", + Action: configs.Errno, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + pwd := &libcontainer.Process{ + Cwd: "/", + Args: []string{"pwd"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(pwd) + if err != nil { + t.Fatal(err) + } + ps, err := pwd.Wait() + if err == nil { + t.Fatal("Expecting error (negative return code); instead exited cleanly!") + } + + var exitCode int + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "pwd: getcwd: Operation not permitted" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompPermitWriteConditional(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + dmesg := &libcontainer.Process{ + Cwd: "/", + Args: []string{"busybox", "ls", "/"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(dmesg) + if err != nil { + t.Fatal(err) + } + if _, err := dmesg.Wait(); err != nil { + t.Fatalf("%s: %s", err, buffers.Stderr) + } +} + +func TestSeccompDenyWriteConditional(t *testing.T) { + if testing.Short() { + return + } + + // Only test if library version is v2.2.1 or higher + // Conditional filtering will always error in v2.2.0 and lower + major, minor, micro := libseccomp.GetLibraryVersion() + if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + dmesg := &libcontainer.Process{ + Cwd: "/", + Args: []string{"busybox", "ls", "does_not_exist"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(dmesg) + if err != nil { + t.Fatal(err) + } + + ps, err := dmesg.Wait() + if err == nil { + t.Fatal("Expecting negative return, instead got 0!") + } + + var exitCode int + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + + // We're denying write to stderr, so we expect an empty buffer + expected := "" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompPermitWriteMultipleConditions(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + { + Index: 2, + Value: 0, + Op: configs.NotEqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers) + } + // We don't need to verify the actual thing printed + // Just that something was written to stdout + if len(buffers.Stdout.String()) == 0 { + t.Fatalf("Nothing was written to stdout, write call failed!\n") + } +} + +func TestSeccompDenyWriteMultipleConditions(t *testing.T) { + if testing.Short() { + return + } + + // Only test if library version is v2.2.1 or higher + // Conditional filtering will always error in v2.2.0 and lower + major, minor, micro := libseccomp.GetLibraryVersion() + if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + { + Index: 2, + Value: 0, + Op: configs.NotEqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + if err == nil { + t.Fatalf("Expecting error return, instead got 0") + } + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + // Prevent writing to both stdout and stderr + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 1, + Op: configs.EqualTo, + }, + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers) + } + // Verify that nothing was printed + if len(buffers.Stdout.String()) != 0 { + t.Fatalf("Something was written to stdout, write call succeeded!\n") + } +} + +func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + // Prevent writing to both stdout and stderr + config := newTemplateConfig(rootfs) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 1, + Op: configs.EqualTo, + }, + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + if err == nil { + t.Fatalf("Expecting error return, instead got 0") + } + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + // Verify nothing was printed + if len(buffers.Stderr.String()) != 0 { + t.Fatalf("Something was written to stderr, write call succeeded!\n") + } +} diff --git a/libcontainer/integration/template_test.go b/libcontainer/integration/template_test.go new file mode 100644 index 0000000..5f7cab5 --- /dev/null +++ b/libcontainer/integration/template_test.go @@ -0,0 +1,191 @@ +package integration + +import ( + "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" +) + +var standardEnvironment = []string{ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=integration", + "TERM=xterm", +} + +const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV + +// newTemplateConfig returns a base template for running a container +// +// it uses a network strategy of just setting a loopback interface +// and the default setup for devices +func newTemplateConfig(rootfs string) *configs.Config { + allowAllDevices := false + return &configs.Config{ + Rootfs: rootfs, + Capabilities: &configs.Capabilities{ + Bounding: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Permitted: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Inheritable: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Ambient: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Effective: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWNET}, + }), + Cgroups: &configs.Cgroup{ + Path: "integration/test", + Resources: &configs.Resources{ + MemorySwappiness: nil, + AllowAllDevices: &allowAllDevices, + AllowedDevices: configs.DefaultAllowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: configs.DefaultAutoCreatedDevices, + Hostname: "integration", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: unix.MS_NOSUID | unix.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: unix.MS_NOSUID | unix.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + /* + CI is broken on the debian based kernels with this + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + */ + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | unix.MS_RDONLY, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: unix.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, + } +} diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go new file mode 100644 index 0000000..8b2d714 --- /dev/null +++ b/libcontainer/integration/utils_test.go @@ -0,0 +1,187 @@ +package integration + +import ( + "bytes" + "crypto/md5" + "encoding/hex" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "syscall" + "testing" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func ptrInt(v int) *int { + return &v +} + +func newStdBuffers() *stdBuffers { + return &stdBuffers{ + Stdin: bytes.NewBuffer(nil), + Stdout: bytes.NewBuffer(nil), + Stderr: bytes.NewBuffer(nil), + } +} + +type stdBuffers struct { + Stdin *bytes.Buffer + Stdout *bytes.Buffer + Stderr *bytes.Buffer +} + +func (b *stdBuffers) String() string { + s := []string{} + if b.Stderr != nil { + s = append(s, b.Stderr.String()) + } + if b.Stdout != nil { + s = append(s, b.Stdout.String()) + } + return strings.Join(s, "|") +} + +// ok fails the test if an err is not nil. +func ok(t testing.TB, err error) { + if err != nil { + _, file, line, _ := runtime.Caller(1) + t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + } +} + +func waitProcess(p *libcontainer.Process, t *testing.T) { + _, file, line, _ := runtime.Caller(1) + status, err := p.Wait() + + if err != nil { + t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + } + + if !status.Success() { + t.Fatalf("%s:%d: unexpected status: %s\n\n", filepath.Base(file), line, status.String()) + } +} + +func newTestRoot() (string, error) { + dir, err := ioutil.TempDir("", "libcontainer") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + testRoots = append(testRoots, dir) + return dir, nil +} + +func newTestBundle() (string, error) { + dir, err := ioutil.TempDir("", "bundle") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + return dir, nil +} + +// newRootfs creates a new tmp directory and copies the busybox root filesystem +func newRootfs() (string, error) { + dir, err := ioutil.TempDir("", "") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + if err := copyBusybox(dir); err != nil { + return "", err + } + return dir, nil +} + +func remove(dir string) { + os.RemoveAll(dir) +} + +// copyBusybox copies the rootfs for a busybox container created for the test image +// into the new directory for the specific test +func copyBusybox(dest string) error { + out, err := exec.Command("sh", "-c", fmt.Sprintf("cp -a /busybox/* %s/", dest)).CombinedOutput() + if err != nil { + return fmt.Errorf("copy error %q: %q", err, out) + } + return nil +} + +func newContainer(config *configs.Config) (libcontainer.Container, error) { + h := md5.New() + h.Write([]byte(time.Now().String())) + return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config) +} + +func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { + root, err := newTestRoot() + if err != nil { + return nil, err + } + + f, err := libcontainer.New(root, libcontainer.Cgroupfs) + if err != nil { + return nil, err + } + if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" { + f, err = libcontainer.New(root, libcontainer.SystemdCgroups) + if err != nil { + return nil, err + } + } + return f.Create(name, config) +} + +// runContainer runs the container with the specific config and arguments +// +// buffers are returned containing the STDOUT and STDERR output for the run +// along with the exit code and any go error +func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) { + container, err := newContainer(config) + if err != nil { + return nil, -1, err + } + defer container.Destroy() + buffers = newStdBuffers() + process := &libcontainer.Process{ + Cwd: "/", + Args: args, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(process) + if err != nil { + return buffers, -1, err + } + ps, err := process.Wait() + if err != nil { + return buffers, -1, err + } + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + return buffers, -1, err + } + return +} diff --git a/libcontainer/intelrdt/intelrdt.go b/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 0000000..0071ce7 --- /dev/null +++ b/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,773 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT features: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are + * two sub-features of RDT. + * + * Cache Allocation Technology (CAT) provides a way for the software to restrict + * cache allocation to a defined 'subset' of L3 cache which may be overlapping + * with other 'subsets'. The different subsets are identified by class of + * service (CLOS) and each CLOS has a capacity bitmask (CBM). + * + * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle + * over memory bandwidth for the software. A user controls the resource by + * indicating the percentage of maximum memory bandwidth or memory bandwidth + * limit in MBps unit if MBA Software Controller is enabled. + * + * More details about Intel RDT CAT and MBA can be found in the section 17.18 + * of Intel Software Developer Manual: + * https://software.intel.com/en-us/articles/intel-sdm + * + * About Intel RDT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. + * + * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via + * "resource control" filesystem. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | | |-- cbm_mask + * | | |-- min_cbm_bits + * | | |-- num_closids + * | |-- MB + * | |-- bandwidth_gran + * | |-- delay_linear + * | |-- min_bandwidth + * | |-- num_closids + * |-- ... + * |-- schemata + * |-- tasks + * |-- + * |-- ... + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 + * cache and memory bandwidth resources constraints. + * + * The file `tasks` has a list of tasks that belongs to this group (e.g., + * " group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. + * + * The file `schemata` has a list of all the resources available to this group. + * Each resource (L3 cache, memory bandwidth) has its own line and format. + * + * L3 cache schema: + * It has allocation bitmasks/values for L3 cache on each socket, which + * contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:=;=;..." + * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel CPU models. Kernel will check if it is valid when writing. + * e.g., default value 0xfffff in root indicates the max bits of CBM is 20 + * bits, which mapping to entire L3 cache capacity. Some valid CBM values to + * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * Memory bandwidth schema: + * It has allocation values for memory bandwidth on each socket, which contains + * L3 cache id and memory bandwidth. + * Format: "MB:=bandwidth0;=bandwidth1;..." + * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + * + * The minimum bandwidth percentage value for each CPU model is predefined and + * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity + * that is allocated is also dependent on the CPU model and can be looked up at + * "info/MB/bandwidth_gran". The available bandwidth control steps are: + * min_bw + N * bw_gran. Intermediate values are rounded to the next control + * step available on the hardware. + * + * If MBA Software Controller is enabled through mount option "-o mba_MBps": + * mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + * We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit + * instead of "percentages". The kernel underneath would use a software feedback + * mechanism or a "Software Controller" which reads the actual bandwidth using + * MBM counters and adjust the memory bandwidth percentages to ensure: + * "actual memory bandwidth < user specified memory bandwidth". + * + * For example, on a two-socket machine, the schema line could be + * "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 + * and 7000 MBps memory bandwidth limit on socket 1. + * + * For more information about Intel RDT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% + * with a memory bandwidth granularity of 10%. + * + * Tasks inside the container only have access to the "upper" 7/11 of L3 cache + * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a + * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=7f0;1=1f", + * "memBwSchema": "MB:0=20;1=70" + * } + * } + */ + +type Manager interface { + // Applies Intel RDT configuration to the process with the specified pid + Apply(pid int) error + + // Returns statistics for Intel RDT + GetStats() (*Stats, error) + + // Destroys the Intel RDT 'container_id' group + Destroy() error + + // Returns Intel RDT path to save in a state file and to be able to + // restore the object later + GetPath() string + + // Set Intel RDT "resource control" filesystem as configured. + Set(container *configs.Config) error +} + +// This implements interface Manager +type IntelRdtManager struct { + mu sync.Mutex + Config *configs.Config + Id string + Path string +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT/CAT is enabled + isCatEnabled bool + // The flag to indicate if Intel RDT/MBA is enabled + isMbaEnabled bool + // The flag to indicate if Intel RDT/MBA Software Controller is enabled + isMbaScEnabled bool +) + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Check if Intel RDT sub-features are enabled in init() +func init() { + // 1. Check if hardware and kernel support Intel RDT sub-features + // "cat_l3" flag for CAT and "mba" flag for MBA + isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { + return + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + if !isIntelRdtMounted() { + return + } + + // 3. Double check if Intel RDT sub-features are available in + // "resource control" filesystem. Intel RDT sub-features can be + // selectively disabled or enabled by kernel command line + // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel + if isCatFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { + isCatEnabled = true + } + } + if isMbaScEnabled { + // We confirm MBA Software Controller is enabled in step 2, + // MBA should be enabled because MBA Software Controller + // depends on MBA + isMbaEnabled = true + } else if isMbaFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { + isMbaEnabled = true + } + } +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formatted. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" + if strings.Contains(postSeparatorFields[2], "mba_MBps") { + isMbaScEnabled = true + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("Intel RDT") +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + return false + } + + return true +} + +func parseCpuInfoFile(path string) (bool, bool, error) { + isCatFlagSet := false + isMbaFlagSet := false + + f, err := os.Open(path) + if err != nil { + return false, false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, false, err + } + + line := s.Text() + + // Search "cat_l3" and "mba" flags in first "flags" line + if strings.Contains(line, "flags") { + flags := strings.Split(line, " ") + // "cat_l3" flag for CAT and "mba" flag for MBA + for _, flag := range flags { + switch flag { + case "cat_l3": + isCatFlagSet = true + case "mba": + isMbaFlagSet = true + } + } + return isCatFlagSet, isMbaFlagSet, nil + } + } + return isCatFlagSet, isMbaFlagSet, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// Get the read-only L3 cache information +func getL3CacheInfo() (*L3CacheInfo, error) { + l3CacheInfo := &L3CacheInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return l3CacheInfo, err + } + + path := filepath.Join(rootPath, "info", "L3") + cbmMask, err := getIntelRdtParamString(path, "cbm_mask") + if err != nil { + return l3CacheInfo, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return l3CacheInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return l3CacheInfo, err + } + + l3CacheInfo.CbmMask = cbmMask + l3CacheInfo.MinCbmBits = minCbmBits + l3CacheInfo.NumClosids = numClosids + + return l3CacheInfo, nil +} + +// Get the read-only memory bandwidth information +func getMemBwInfo() (*MemBwInfo, error) { + memBwInfo := &MemBwInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return memBwInfo, err + } + + path := filepath.Join(rootPath, "info", "MB") + bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran") + if err != nil { + return memBwInfo, err + } + delayLinear, err := getIntelRdtParamUint(path, "delay_linear") + if err != nil { + return memBwInfo, err + } + minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth") + if err != nil { + return memBwInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return memBwInfo, err + } + + memBwInfo.BandwidthGran = bandwidthGran + memBwInfo.DelayLinear = delayLinear + memBwInfo.MinBandwidth = minBandwidth + memBwInfo.NumClosids = numClosids + + return memBwInfo, nil +} + +// Get diagnostics for last filesystem operation error from file info/last_cmd_status +func getLastCmdStatus() (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, "info") + lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status") + if err != nil { + return "", err + } + + return lastCmdStatus, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Don't attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT/CAT is enabled +func IsCatEnabled() bool { + return isCatEnabled +} + +// Check if Intel RDT/MBA is enabled +func IsMbaEnabled() bool { + return isMbaEnabled +} + +// Check if Intel RDT/MBA Software Controller is enabled +func IsMbaScEnabled() bool { + return isMbaScEnabled +} + +// Get the 'container_id' path in Intel RDT "resource control" filesystem +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Applies Intel RDT configuration to the process with the specified pid +func (m *IntelRdtManager) Apply(pid int) (err error) { + // If intelRdt is not specified in config, we do nothing + if m.Config.IntelRdt == nil { + return nil + } + d, err := getIntelRdtData(m.Config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.Id) + if err != nil { + return err + } + + m.Path = path + return nil +} + +// Destroys the Intel RDT 'container_id' group +func (m *IntelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.GetPath()); err != nil { + return err + } + m.Path = "" + return nil +} + +// Returns Intel RDT path to save in a state file and to be able to +// restore the object later +func (m *IntelRdtManager) GetPath() string { + if m.Path == "" { + m.Path, _ = GetIntelRdtPath(m.Id) + } + return m.Path +} + +// Returns statistics for Intel RDT +func (m *IntelRdtManager) GetStats() (*Stats, error) { + // If intelRdt is not specified in config + if m.Config.IntelRdt == nil { + return nil, nil + } + + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + // The read-only L3 cache and memory bandwidth schemata in root + tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + schemaRootStrings := strings.Split(tmpRootStrings, "\n") + + // The L3 cache and memory bandwidth schemata in 'container_id' group + tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") + if err != nil { + return nil, err + } + schemaStrings := strings.Split(tmpStrings, "\n") + + if IsCatEnabled() { + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "L3") { + stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The L3 cache schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "L3") { + stats.L3CacheSchema = strings.TrimSpace(schema) + } + } + } + + if IsMbaEnabled() { + // The read-only memory bandwidth information + memBwInfo, err := getMemBwInfo() + if err != nil { + return nil, err + } + stats.MemBwInfo = memBwInfo + + // The read-only memory bandwidth information + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "MB") { + stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The memory bandwidth schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "MB") { + stats.MemBwSchema = strings.TrimSpace(schema) + } + } + } + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. +func (m *IntelRdtManager) Set(container *configs.Config) error { + // About L3 cache schema: + // It has allocation bitmasks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:=;=;..." + // For example, on a two-socket machine, the schema line could be: + // L3:0=ff;1=c0 + // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM + // is 0xc0. + // + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel CPU models. Kernel will check + // if it is valid when writing. e.g., default value 0xfffff in root + // indicates the max bits of CBM is 20 bits, which mapping to entire + // L3 cache capacity. Some valid CBM values to set in a group: + // 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // + // + // About memory bandwidth schema: + // It has allocation values for memory bandwidth on each socket, which + // contains L3 cache id and memory bandwidth. + // Format: "MB:=bandwidth0;=bandwidth1;..." + // For example, on a two-socket machine, the schema line could be: + // "MB:0=20;1=70" + // + // The minimum bandwidth percentage value for each CPU model is + // predefined and can be looked up through "info/MB/min_bandwidth". + // The bandwidth granularity that is allocated is also dependent on + // the CPU model and can be looked up at "info/MB/bandwidth_gran". + // The available bandwidth control steps are: min_bw + N * bw_gran. + // Intermediate values are rounded to the next control step available + // on the hardware. + // + // If MBA Software Controller is enabled through mount option + // "-o mba_MBps": mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + // We could specify memory bandwidth in "MBps" (Mega Bytes per second) + // unit instead of "percentages". The kernel underneath would use a + // software feedback mechanism or a "Software Controller" which reads + // the actual bandwidth using MBM counters and adjust the memory + // bandwidth percentages to ensure: + // "actual memory bandwidth < user specified memory bandwidth". + // + // For example, on a two-socket machine, the schema line could be + // "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on + // socket 0 and 7000 MBps memory bandwidth limit on socket 1. + if container.IntelRdt != nil { + path := m.GetPath() + l3CacheSchema := container.IntelRdt.L3CacheSchema + memBwSchema := container.IntelRdt.MemBwSchema + + // Write a single joint schema string to schemata file + if l3CacheSchema != "" && memBwSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only L3 cache schema string to schemata file + if l3CacheSchema != "" && memBwSchema == "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only memory bandwidth schema string to schemata file + if l3CacheSchema == "" && memBwSchema != "" { + if err := writeFile(path, "schemata", memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + } + + return nil +} + +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", NewLastCmdError(err) + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", NewLastCmdError(err) + } + return path, nil +} + +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} + +type LastCmdError struct { + LastCmdStatus string + Err error +} + +func (e *LastCmdError) Error() string { + return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus) +} + +func NewLastCmdError(err error) error { + lastCmdStatus, err1 := getLastCmdStatus() + if err1 == nil { + return &LastCmdError{ + LastCmdStatus: lastCmdStatus, + Err: err, + } + } + return err +} diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go new file mode 100644 index 0000000..a19b961 --- /dev/null +++ b/libcontainer/intelrdt/intelrdt_test.go @@ -0,0 +1,122 @@ +// +build linux + +package intelrdt + +import ( + "strings" + "testing" +) + +func TestIntelRdtSetL3CacheSchema(t *testing.T) { + if !IsCatEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + l3CacheSchemaBefore = "L3:0=f;1=f0" + l3CacheSchemeAfter = "L3:0=f0;1=f" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter + intelrdt := &IntelRdtManager{ + Config: helper.IntelRdtData.config, + Path: helper.IntelRdtPath, + } + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != l3CacheSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +func TestIntelRdtSetMemBwSchema(t *testing.T) { + if !IsMbaEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + memBwSchemaBefore = "MB:0=20;1=70" + memBwSchemeAfter = "MB:0=70;1=20" + ) + + helper.writeFileContents(map[string]string{ + "schemata": memBwSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwSchemeAfter + intelrdt := &IntelRdtManager{ + Config: helper.IntelRdtData.config, + Path: helper.IntelRdtPath, + } + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != memBwSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +func TestIntelRdtSetMemBwScSchema(t *testing.T) { + if !IsMbaScEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + memBwScSchemaBefore = "MB:0=5000;1=7000" + memBwScSchemeAfter = "MB:0=9000;1=4000" + ) + + helper.writeFileContents(map[string]string{ + "schemata": memBwScSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwScSchemeAfter + intelrdt := &IntelRdtManager{ + Config: helper.IntelRdtData.config, + Path: helper.IntelRdtPath, + } + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != memBwScSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} diff --git a/libcontainer/intelrdt/stats.go b/libcontainer/intelrdt/stats.go new file mode 100644 index 0000000..df5686f --- /dev/null +++ b/libcontainer/intelrdt/stats.go @@ -0,0 +1,40 @@ +// +build linux + +package intelrdt + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type Stats struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/libcontainer/intelrdt/util_test.go b/libcontainer/intelrdt/util_test.go new file mode 100644 index 0000000..970b6ce --- /dev/null +++ b/libcontainer/intelrdt/util_test.go @@ -0,0 +1,67 @@ +// +build linux + +/* + * Utility for testing Intel RDT operations. + * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test. + */ +package intelrdt + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type intelRdtTestUtil struct { + // intelRdt data to use in tests + IntelRdtData *intelRdtData + + // Path to the mock Intel RDT "resource control" filesystem directory + IntelRdtPath string + + // Temporary directory to store mock Intel RDT "resource control" filesystem + tempDir string + t *testing.T +} + +// Creates a new test util +func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil { + d := &intelRdtData{ + config: &configs.Config{ + IntelRdt: &configs.IntelRdt{}, + }, + } + tempDir, err := ioutil.TempDir("", "intelrdt_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testIntelRdtPath := filepath.Join(d.root, "resctrl") + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock Intel RDT "resource control" filesystem path exists + err = os.MkdirAll(testIntelRdtPath, 0755) + if err != nil { + t.Fatal(err) + } + return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t} +} + +func (c *intelRdtTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified Intel RDT "resource control" files +func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := writeFile(c.IntelRdtPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/libcontainer/keys/keyctl.go b/libcontainer/keys/keyctl.go new file mode 100644 index 0000000..74dedd5 --- /dev/null +++ b/libcontainer/keys/keyctl.go @@ -0,0 +1,48 @@ +// +build linux + +package keys + +import ( + "fmt" + "strconv" + "strings" + + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +type KeySerial uint32 + +func JoinSessionKeyring(name string) (KeySerial, error) { + sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) + if err != nil { + return 0, errors.Wrap(err, "create session key") + } + return KeySerial(sessKeyId), nil +} + +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, +// anding the bits with the given mask (clearing permissions) and setting +// additional permission bits +func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { + dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId)) + if err != nil { + return err + } + + res := strings.Split(dest, ";") + if len(res) < 5 { + return fmt.Errorf("Destination buffer for key description is too small") + } + + // parse permissions + perm64, err := strconv.ParseUint(res[3], 16, 32) + if err != nil { + return err + } + + perm := (uint32(perm64) & mask) | setbits + + return unix.KeyctlSetperm(int(ringId), perm) +} diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go new file mode 100644 index 0000000..1077e7b --- /dev/null +++ b/libcontainer/logs/logs.go @@ -0,0 +1,102 @@ +package logs + +import ( + "bufio" + "encoding/json" + "fmt" + "io" + "os" + "strconv" + "sync" + + "github.com/sirupsen/logrus" +) + +var ( + configureMutex = sync.Mutex{} + // loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`. + // Subsequent invocations of `ConfigureLogging` would be no-op + loggingConfigured = false +) + +type Config struct { + LogLevel logrus.Level + LogFormat string + LogFilePath string + LogPipeFd string +} + +func ForwardLogs(logPipe io.Reader) { + lineReader := bufio.NewReader(logPipe) + for { + line, err := lineReader.ReadBytes('\n') + if len(line) > 0 { + processEntry(line) + } + if err == io.EOF { + logrus.Debugf("log pipe has been closed: %+v", err) + return + } + if err != nil { + logrus.Errorf("log pipe read error: %+v", err) + } + } +} + +func processEntry(text []byte) { + type jsonLog struct { + Level string `json:"level"` + Msg string `json:"msg"` + } + + var jl jsonLog + if err := json.Unmarshal(text, &jl); err != nil { + logrus.Errorf("failed to decode %q to json: %+v", text, err) + return + } + + lvl, err := logrus.ParseLevel(jl.Level) + if err != nil { + logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err) + return + } + logrus.StandardLogger().Logf(lvl, jl.Msg) +} + +func ConfigureLogging(config Config) error { + configureMutex.Lock() + defer configureMutex.Unlock() + + if loggingConfigured { + logrus.Debug("logging has already been configured") + return nil + } + + logrus.SetLevel(config.LogLevel) + + if config.LogPipeFd != "" { + logPipeFdInt, err := strconv.Atoi(config.LogPipeFd) + if err != nil { + return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err) + } + logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe")) + } else if config.LogFilePath != "" { + f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644) + if err != nil { + return err + } + logrus.SetOutput(f) + } + + switch config.LogFormat { + case "text": + // retain logrus's default. + case "json": + logrus.SetFormatter(new(logrus.JSONFormatter)) + default: + return fmt.Errorf("unknown log-format %q", config.LogFormat) + } + + loggingConfigured = true + return nil +} diff --git a/libcontainer/logs/logs_linux_test.go b/libcontainer/logs/logs_linux_test.go new file mode 100644 index 0000000..83166fa --- /dev/null +++ b/libcontainer/logs/logs_linux_test.go @@ -0,0 +1,160 @@ +package logs + +import ( + "errors" + "io/ioutil" + "os" + "strings" + "testing" + "time" + + "github.com/sirupsen/logrus" +) + +func TestLoggingToFile(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "kitten") { + t.Fatalf("%s does not contain kitten", string(logFileContent)) + } +} + +func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, "invalid-json-with-kitten") + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "failed to decode") { + t.Fatalf("%q does not contain decoding error", string(logFileContent)) + } + + truncateLogFile(t, logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + + logFileContent = waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "puppy") { + t.Fatalf("%s does not contain puppy", string(logFileContent)) + } +} + +func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "failed to parse log level") { + t.Fatalf("%q does not contain log level parsing error", string(logFileContent)) + } + + truncateLogFile(t, logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + + logFileContent = waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "puppy") { + t.Fatalf("%s does not contain puppy", string(logFileContent)) + } +} + +func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) { + logW, logFile, doneForwarding := runLogForwarding(t) + defer os.Remove(logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(string(logFileContent), "sync") { + t.Fatalf("%q does not contain sync message", string(logFileContent)) + } + + logW.Close() + select { + case <-doneForwarding: + case <-time.After(10 * time.Second): + t.Fatal("log forwarding did not stop after closing the pipe") + } +} + +func logToLogWriter(t *testing.T, logW *os.File, message string) { + _, err := logW.Write([]byte(message + "\n")) + if err != nil { + t.Fatalf("failed to write %q to log writer: %v", message, err) + } +} + +func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) { + logR, logW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + tempFile, err := ioutil.TempFile("", "") + if err != nil { + t.Fatal(err) + } + logFile := tempFile.Name() + + logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile} + return logW, logFile, startLogForwarding(t, logConfig, logR) +} + +func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} { + loggingConfigured = false + if err := ConfigureLogging(logConfig); err != nil { + t.Fatal(err) + } + doneForwarding := make(chan struct{}) + go func() { + ForwardLogs(logR) + close(doneForwarding) + }() + return doneForwarding +} + +func waitForLogContent(t *testing.T, logFile string) string { + startTime := time.Now() + + for { + if time.Now().After(startTime.Add(10 * time.Second)) { + t.Fatal(errors.New("No content in log file after 10 seconds")) + break + } + + fileContent, err := ioutil.ReadFile(logFile) + if err != nil { + t.Fatal(err) + } + if len(fileContent) == 0 { + continue + } + return string(fileContent) + } + + return "" +} + +func truncateLogFile(t *testing.T, logFile string) { + file, err := os.OpenFile(logFile, os.O_RDWR, 0666) + if err != nil { + t.Fatalf("failed to open log file: %v", err) + return + } + defer file.Close() + + err = file.Truncate(0) + if err != nil { + t.Fatalf("failed to truncate log file: %v", err) + } +} diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go new file mode 100644 index 0000000..1d4f503 --- /dev/null +++ b/libcontainer/message_linux.go @@ -0,0 +1,89 @@ +// +build linux + +package libcontainer + +import ( + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +// list of known message types we want to send to bootstrap program +// The number is randomly chosen to not conflict with known netlink types +const ( + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessEUIDAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 +) + +type Int32msg struct { + Type uint16 + Value uint32 +} + +// Serialize serializes the message. +// Int32msg has the following representation +// | nlattr len | nlattr type | +// | uint32 value | +func (msg *Int32msg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + native.PutUint32(buf[4:8], msg.Value) + return buf +} + +func (msg *Int32msg) Len() int { + return unix.NLA_HDRLEN + 4 +} + +// Bytemsg has the following representation +// | nlattr len | nlattr type | +// | value | pad | +type Bytemsg struct { + Type uint16 + Value []byte +} + +func (msg *Bytemsg) Serialize() []byte { + l := msg.Len() + buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1)) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(l)) + native.PutUint16(buf[2:4], msg.Type) + copy(buf[4:], msg.Value) + return buf +} + +func (msg *Bytemsg) Len() int { + return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated +} + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + native.PutUint32(buf[4:8], uint32(1)) + } else { + native.PutUint32(buf[4:8], uint32(0)) + } + return buf +} + +func (msg *Boolmsg) Len() int { + return unix.NLA_HDRLEN + 4 // alignment +} diff --git a/libcontainer/mount/mount.go b/libcontainer/mount/mount.go new file mode 100644 index 0000000..e8965e0 --- /dev/null +++ b/libcontainer/mount/mount.go @@ -0,0 +1,23 @@ +package mount + +// GetMounts retrieves a list of mounts for the current running process. +func GetMounts() ([]*Info, error) { + return parseMountTable() +} + +// Mounted looks at /proc/self/mountinfo to determine of the specified +// mountpoint has been mounted +func Mounted(mountpoint string) (bool, error) { + entries, err := parseMountTable() + if err != nil { + return false, err + } + + // Search the table for the mountpoint + for _, e := range entries { + if e.Mountpoint == mountpoint { + return true, nil + } + } + return false, nil +} diff --git a/libcontainer/mount/mount_linux.go b/libcontainer/mount/mount_linux.go new file mode 100644 index 0000000..1e51919 --- /dev/null +++ b/libcontainer/mount/mount_linux.go @@ -0,0 +1,82 @@ +// +build linux + +package mount + +import ( + "bufio" + "fmt" + "io" + "os" + "strings" +) + +const ( + /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + (1) mount ID: unique identifier of the mount (may be reused after umount) + (2) parent ID: ID of parent (or of self for the top of the mount tree) + (3) major:minor: value of st_dev for files on filesystem + (4) root: root of the mount within the filesystem + (5) mount point: mount point relative to the process's root + (6) mount options: per mount options + (7) optional fields: zero or more fields of the form "tag[:value]" + (8) separator: marks the end of the optional fields + (9) filesystem type: name of filesystem of the form "type[.subtype]" + (10) mount source: filesystem specific information or "none" + (11) super options: per super block options*/ + mountinfoFormat = "%d %d %d:%d %s %s %s %s" +) + +// Parse /proc/self/mountinfo because comparing Dev and ino does not work from +// bind mounts +func parseMountTable() ([]*Info, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + return parseInfoFile(f) +} + +func parseInfoFile(r io.Reader) ([]*Info, error) { + var ( + s = bufio.NewScanner(r) + out = []*Info{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + var ( + p = &Info{} + text = s.Text() + optionalFields string + ) + + if _, err := fmt.Sscanf(text, mountinfoFormat, + &p.ID, &p.Parent, &p.Major, &p.Minor, + &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil { + return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err) + } + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + if len(postSeparatorFields) < 3 { + return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + if optionalFields != "-" { + p.Optional = optionalFields + } + + p.Fstype = postSeparatorFields[0] + p.Source = postSeparatorFields[1] + p.VfsOpts = strings.Join(postSeparatorFields[2:], " ") + out = append(out, p) + } + return out, nil +} diff --git a/libcontainer/mount/mountinfo.go b/libcontainer/mount/mountinfo.go new file mode 100644 index 0000000..e3fc353 --- /dev/null +++ b/libcontainer/mount/mountinfo.go @@ -0,0 +1,40 @@ +package mount + +// Info reveals information about a particular mounted filesystem. This +// struct is populated from the content in the /proc//mountinfo file. +type Info struct { + // ID is a unique identifier of the mount (may be reused after umount). + ID int + + // Parent indicates the ID of the mount parent (or of self for the top of the + // mount tree). + Parent int + + // Major indicates one half of the device ID which identifies the device class. + Major int + + // Minor indicates one half of the device ID which identifies a specific + // instance of device. + Minor int + + // Root of the mount within the filesystem. + Root string + + // Mountpoint indicates the mount point relative to the process's root. + Mountpoint string + + // Opts represents mount-specific options. + Opts string + + // Optional represents optional fields. + Optional string + + // Fstype indicates the type of filesystem, such as EXT3. + Fstype string + + // Source indicates filesystem specific information or "none". + Source string + + // VfsOpts represents per super block options. + VfsOpts string +} diff --git a/libcontainer/network_linux.go b/libcontainer/network_linux.go new file mode 100644 index 0000000..938d8ce --- /dev/null +++ b/libcontainer/network_linux.go @@ -0,0 +1,103 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/types" + "github.com/vishvananda/netlink" +) + +var strategies = map[string]networkStrategy{ + "loopback": &loopback{}, +} + +// networkStrategy represents a specific network configuration for +// a container's networking stack +type networkStrategy interface { + create(*network, int) error + initialize(*network) error + detach(*configs.Network) error + attach(*configs.Network) error +} + +// getStrategy returns the specific network strategy for the +// provided type. +func getStrategy(tpe string) (networkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, fmt.Errorf("unknown strategy type %q", tpe) + } + return s, nil +} + +// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo. +func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) { + out := &types.NetworkInterface{Name: interfaceName} + // This can happen if the network runtime information is missing - possible if the + // container was created by an old version of libcontainer. + if interfaceName == "" { + return out, nil + } + type netStatsPair struct { + // Where to write the output. + Out *uint64 + // The network stats file to read. + File string + } + // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container. + netStats := []netStatsPair{ + {Out: &out.RxBytes, File: "tx_bytes"}, + {Out: &out.RxPackets, File: "tx_packets"}, + {Out: &out.RxErrors, File: "tx_errors"}, + {Out: &out.RxDropped, File: "tx_dropped"}, + + {Out: &out.TxBytes, File: "rx_bytes"}, + {Out: &out.TxPackets, File: "rx_packets"}, + {Out: &out.TxErrors, File: "rx_errors"}, + {Out: &out.TxDropped, File: "rx_dropped"}, + } + for _, netStat := range netStats { + data, err := readSysfsNetworkStats(interfaceName, netStat.File) + if err != nil { + return nil, err + } + *(netStat.Out) = data + } + return out, nil +} + +// Reads the specified statistics available under /sys/class/net//statistics +func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) { + data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)) + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) +} + +// loopback is a network strategy that provides a basic loopback device +type loopback struct { +} + +func (l *loopback) create(n *network, nspid int) error { + return nil +} + +func (l *loopback) initialize(config *network) error { + return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}}) +} + +func (l *loopback) attach(n *configs.Network) (err error) { + return nil +} + +func (l *loopback) detach(n *configs.Network) (err error) { + return nil +} diff --git a/libcontainer/notify_linux.go b/libcontainer/notify_linux.go new file mode 100644 index 0000000..47a0678 --- /dev/null +++ b/libcontainer/notify_linux.go @@ -0,0 +1,90 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +const oomCgroupName = "memory" + +type PressureLevel uint + +const ( + LowPressure PressureLevel = iota + MediumPressure + CriticalPressure +) + +func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) { + evFile, err := os.Open(filepath.Join(cgDir, evName)) + if err != nil { + return nil, err + } + fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + evFile.Close() + return nil, err + } + + eventfd := os.NewFile(uintptr(fd), "eventfd") + + eventControlPath := filepath.Join(cgDir, "cgroup.event_control") + data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg) + if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { + eventfd.Close() + evFile.Close() + return nil, err + } + ch := make(chan struct{}) + go func() { + defer func() { + eventfd.Close() + evFile.Close() + close(ch) + }() + buf := make([]byte, 8) + for { + if _, err := eventfd.Read(buf); err != nil { + return + } + // When a cgroup is destroyed, an event is sent to eventfd. + // So if the control path is gone, return instead of notifying. + if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { + return + } + ch <- struct{}{} + } + }() + return ch, nil +} + +// notifyOnOOM returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. +func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + return registerMemoryEvent(dir, "memory.oom_control", "") +} + +func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + if level > CriticalPressure { + return nil, fmt.Errorf("invalid pressure level %d", level) + } + + levelStr := []string{"low", "medium", "critical"}[level] + return registerMemoryEvent(dir, "memory.pressure_level", levelStr) +} diff --git a/libcontainer/notify_linux_test.go b/libcontainer/notify_linux_test.go new file mode 100644 index 0000000..1e15ae2 --- /dev/null +++ b/libcontainer/notify_linux_test.go @@ -0,0 +1,126 @@ +// +build linux + +package libcontainer + +import ( + "encoding/binary" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + "time" + + "golang.org/x/sys/unix" +) + +type notifyFunc func(paths map[string]string) (<-chan struct{}, error) + +func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ string) { + memoryPath, err := ioutil.TempDir("", "testmemnotification-"+evName) + if err != nil { + t.Fatal(err) + } + evFile := filepath.Join(memoryPath, evName) + eventPath := filepath.Join(memoryPath, "cgroup.event_control") + if err := ioutil.WriteFile(evFile, []byte{}, 0700); err != nil { + t.Fatal(err) + } + if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil { + t.Fatal(err) + } + paths := map[string]string{ + "memory": memoryPath, + } + ch, err := notify(paths) + if err != nil { + t.Fatal("expected no error, got:", err) + } + + data, err := ioutil.ReadFile(eventPath) + if err != nil { + t.Fatal("couldn't read event control file:", err) + } + + var eventFd, evFd int + var arg string + if targ != "" { + _, err = fmt.Sscanf(string(data), "%d %d %s", &eventFd, &evFd, &arg) + } else { + _, err = fmt.Sscanf(string(data), "%d %d", &eventFd, &evFd) + } + if err != nil || arg != targ { + t.Fatalf("invalid control data %q: %s", data, err) + } + + // dup the eventfd + efd, err := unix.Dup(eventFd) + if err != nil { + t.Fatal("unable to dup eventfd:", err) + } + defer unix.Close(efd) + + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, 1) + + if _, err := unix.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + select { + case <-ch: + case <-time.After(100 * time.Millisecond): + t.Fatal("no notification on channel after 100ms") + } + + // simulate what happens when a cgroup is destroyed by cleaning up and then + // writing to the eventfd. + if err := os.RemoveAll(memoryPath); err != nil { + t.Fatal(err) + } + if _, err := unix.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + // give things a moment to shut down + select { + case _, ok := <-ch: + if ok { + t.Fatal("expected no notification to be triggered") + } + case <-time.After(100 * time.Millisecond): + t.Fatal("channel not closed after 100ms") + } + + if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(evFd), unix.F_GETFD, 0); err != unix.EBADF { + t.Errorf("expected event control to be closed, but received error %s", err.Error()) + } + + if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(eventFd), unix.F_GETFD, 0); err != unix.EBADF { + t.Errorf("expected event fd to be closed, but received error %s", err.Error()) + } +} + +func TestNotifyOnOOM(t *testing.T) { + f := func(paths map[string]string) (<-chan struct{}, error) { + return notifyOnOOM(paths) + } + + testMemoryNotification(t, "memory.oom_control", f, "") +} + +func TestNotifyMemoryPressure(t *testing.T) { + tests := map[PressureLevel]string{ + LowPressure: "low", + MediumPressure: "medium", + CriticalPressure: "critical", + } + + for level, arg := range tests { + f := func(paths map[string]string) (<-chan struct{}, error) { + return notifyMemoryPressure(paths, level) + } + + testMemoryNotification(t, "memory.pressure_level", f, arg) + } +} diff --git a/libcontainer/nsenter/README.md b/libcontainer/nsenter/README.md new file mode 100644 index 0000000..9ec6c39 --- /dev/null +++ b/libcontainer/nsenter/README.md @@ -0,0 +1,44 @@ +## nsenter + +The `nsenter` package registers a special init constructor that is called before +the Go runtime has a chance to boot. This provides us the ability to `setns` on +existing namespaces and avoid the issues that the Go runtime has with multiple +threads. This constructor will be called if this package is registered, +imported, in your go application. + +The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/) +package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, +called the preamble, is used as a header when compiling the C parts of the package. +So every time we import package `nsenter`, the C code function `nsexec()` would be +called. And package `nsenter` is only imported in `init.go`, so every time the runc +`init` command is invoked, that C code is run. + +Because `nsexec()` must be run before the Go runtime in order to use the +Linux kernel namespace, you must `import` this library into a package if +you plan to use `libcontainer` directly. Otherwise Go will not execute +the `nsexec()` constructor, which means that the re-exec will not cause +the namespaces to be joined. You can import it like this: + +```go +import _ "github.com/opencontainers/runc/libcontainer/nsenter" +``` + +`nsexec()` will first get the file descriptor number for the init pipe +from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened +by the parent and kept open across the fork-exec of the `nsexec()` init +process). The init pipe is used to read bootstrap data (namespace paths, +clone flags, uid and gid mappings, and the console path) from the parent +process. `nsexec()` will then call `setns(2)` to join the namespaces +provided in the bootstrap data (if available), `clone(2)` a child process +with the provided clone flags, update the user and group ID mappings, do +some further miscellaneous setup steps, and then send the PID of the +child process to the parent of the `nsexec()` "caller". Finally, +the parent `nsexec()` will exit and the child `nsexec()` process will +return to allow the Go runtime take over. + +NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any +`CLONE_NEW*` clone flags because we must fork a new process in order to +enter the PID namespace. + + + diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c new file mode 100644 index 0000000..ad10f14 --- /dev/null +++ b/libcontainer/nsenter/cloned_binary.c @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Use our own wrapper for memfd_create. */ +#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) +# define SYS_memfd_create __NR_memfd_create +#endif +/* memfd_create(2) flags -- copied from . */ +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif +int memfd_create(const char *name, unsigned int flags) +{ +#ifdef SYS_memfd_create + return syscall(SYS_memfd_create, name, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + + +/* This comes directly from . */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif +#ifndef F_SEAL_SEAL +# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +# define F_SEAL_GROW 0x0004 /* prevent file from growing */ +# define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ + (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) + +static void *must_realloc(void *ptr, size_t size) +{ + void *old = ptr; + do { + ptr = realloc(old, size); + } while(!ptr); + return ptr; +} + +/* + * Verify whether we are currently in a self-cloned program (namely, is + * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather + * for shmem files), and we want to be sure it's actually sealed. + */ +static int is_self_cloned(void) +{ + int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; + + fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -ENOTRECOVERABLE; + + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ + ret = fcntl(fd, F_GET_SEALS); + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + goto out; + } + + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + if (fstat(fd, &statbuf) >= 0) + is_cloned |= (statbuf.st_nlink == 0); + +out: + close(fd); + return is_cloned; +} + +/* Read a given file into a new buffer, and providing the length. */ +static char *read_file(char *path, size_t *length) +{ + int fd; + char buf[4096], *copy = NULL; + + if (!length) + return NULL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + *length = 0; + for (;;) { + ssize_t n; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) + goto error; + if (!n) + break; + + copy = must_realloc(copy, (*length + n) * sizeof(*copy)); + memcpy(copy + *length, buf, n); + *length += n; + } + close(fd); + return copy; + +error: + close(fd); + free(copy); + return NULL; +} + +/* + * A poor-man's version of "xargs -0". Basically parses a given block of + * NUL-delimited data, within the given length and adds a pointer to each entry + * to the array of pointers. + */ +static int parse_xargs(char *data, int data_length, char ***output) +{ + int num = 0; + char *cur = data; + + if (!data || *output != NULL) + return -1; + + while (cur < data + data_length) { + num++; + *output = must_realloc(*output, (num + 1) * sizeof(**output)); + (*output)[num - 1] = cur; + cur += strlen(cur) + 1; + } + (*output)[num] = NULL; + return num; +} + +/* + * "Parse" out argv from /proc/self/cmdline. + * This is necessary because we are running in a context where we don't have a + * main() that we can just get the arguments from. + */ +static int fetchve(char ***argv) +{ + char *cmdline = NULL; + size_t cmdline_size; + + cmdline = read_file("/proc/self/cmdline", &cmdline_size); + if (!cmdline) + goto error; + + if (parse_xargs(cmdline, cmdline_size, argv) <= 0) + goto error; + + return 0; + +error: + free(cmdline); + return -EINVAL; +} + +enum { + EFD_NONE = 0, + EFD_MEMFD, + EFD_FILE, +}; + +/* + * This comes from . We can't hard-code __O_TMPFILE because it + * changes depending on the architecture. If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; + + /* + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. + */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS && errno != EINVAL) + goto error; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto error; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. + */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +error: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). + */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + +static int clone_binary(void) +{ + int binfd, execfd; + struct stat statbuf = {}; + size_t sent = 0; + int fdtype = EFD_NONE; + + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; + + binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (binfd < 0) + goto error; + + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } + sent += n; + } + close(binfd); + if (sent != statbuf.st_size) + goto error; + + if (seal_execfd(&execfd, fdtype) < 0) + goto error; + + return execfd; + +error_binfd: + close(binfd); +error: + close(execfd); + return -EIO; +} + +/* Get cheap access to the environment. */ +extern char **environ; + +int ensure_cloned_binary(void) +{ + int execfd; + char **argv = NULL; + + /* Check that we're not self-cloned, and if we are then bail. */ + int cloned = is_self_cloned(); + if (cloned > 0 || cloned == -ENOTRECOVERABLE) + return cloned; + + if (fetchve(&argv) < 0) + return -EINVAL; + + execfd = clone_binary(); + if (execfd < 0) + return -EIO; + + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + + fexecve(execfd, argv, environ); +error: + close(execfd); + return -ENOEXEC; +} diff --git a/libcontainer/nsenter/namespace.h b/libcontainer/nsenter/namespace.h new file mode 100644 index 0000000..9e9bdca --- /dev/null +++ b/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/libcontainer/nsenter/nsenter.go b/libcontainer/nsenter/nsenter.go new file mode 100644 index 0000000..07f4d63 --- /dev/null +++ b/libcontainer/nsenter/nsenter.go @@ -0,0 +1,12 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" diff --git a/libcontainer/nsenter/nsenter_gccgo.go b/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 0000000..63c7a3e --- /dev/null +++ b/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go new file mode 100644 index 0000000..c4d3c86 --- /dev/null +++ b/libcontainer/nsenter/nsenter_test.go @@ -0,0 +1,239 @@ +package nsenter + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/vishvananda/netlink/nl" + + "golang.org/x/sys/unix" +) + +type pid struct { + Pid int `json:"Pid"` +} + +type logentry struct { + Msg string `json:"msg"` + Level string `json:"level"` +} + +func TestNsenterValidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + + if err := cmd.Start(); err != nil { + t.Fatalf("nsenter failed to start %v", err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + decoder := json.NewDecoder(parent) + var pid *pid + + if err := cmd.Wait(); err != nil { + t.Fatalf("nsenter exits with a non-zero exit status") + } + if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } + t.Fatalf("%v", err) + } + + p, err := os.FindProcess(pid.Pid) + if err != nil { + t.Fatalf("%v", err) + } + p.Wait() +} + +func TestNsenterInvalidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", -1), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterIncorrectPathType(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterChildLogging(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create exec pipe %v", err) + } + logread, logwrite, err := os.Pipe() + if err != nil { + t.Fatalf("failed to create log pipe %v", err) + } + defer logread.Close() + defer logwrite.Close() + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child, logwrite}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_LOGPIPE=4"}, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + + if err := cmd.Start(); err != nil { + t.Fatalf("nsenter failed to start %v", err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + logsDecoder := json.NewDecoder(logread) + var logentry *logentry + + err = logsDecoder.Decode(&logentry) + if err != nil { + t.Fatalf("child log: %v", err) + } + if logentry.Level == "" || logentry.Msg == "" { + t.Fatalf("child log: empty log fileds: level=\"%s\" msg=\"%s\"", logentry.Level, logentry.Msg) + } + + if err := cmd.Wait(); err != nil { + t.Fatalf("nsenter exits with a non-zero exit status") + } +} + +func init() { + if strings.HasPrefix(os.Args[0], "nsenter-") { + os.Exit(0) + } + return +} + +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} diff --git a/libcontainer/nsenter/nsenter_unsupported.go b/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 0000000..2459c63 --- /dev/null +++ b/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux !cgo + +package nsenter diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c new file mode 100644 index 0000000..0726568 --- /dev/null +++ b/libcontainer/nsenter/nsexec.c @@ -0,0 +1,1032 @@ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ +}; + +/* + * Synchronisation value for cgroup namespace setup. + * The same constant is defined in process_linux.go as "createCgroupns". + */ +#define CREATECGROUPNS 0x80 + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* Assume the stack grows down, so arguments should be above it. */ +struct clone_t { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__ ((aligned(16))); + char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ + jmp_buf *env; + int jmpval; +}; + +struct nlconfig_t { + char *data; + + /* Process settings. */ + uint32_t cloneflags; + char *oom_score_adj; + size_t oom_score_adj_len; + + /* User namespace settings. */ + char *uidmap; + size_t uidmap_len; + char *gidmap; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; + uint8_t is_setgroup; + + /* Rootless container settings. */ + uint8_t is_rootless_euid; /* boolean */ + char *uidmappath; + size_t uidmappath_len; + char *gidmappath; + size_t gidmappath_len; +}; + +#define PANIC "panic" +#define FATAL "fatal" +#define ERROR "error" +#define WARNING "warning" +#define INFO "info" +#define DEBUG "debug" + +static int logfd = -1; + +/* + * List of netlink message types sent to us as part of bootstrapping the init. + * These constants are defined in libcontainer/message_linux.go. + */ +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define NS_PATHS_ATTR 27282 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 +#define SETGROUP_ATTR 27285 +#define OOM_SCORE_ADJ_ATTR 27286 +#define ROOTLESS_EUID_ATTR 27287 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 + +/* + * Use the raw syscall for versions of glibc which don't include a function for + * it, namely (glibc 2.12). + */ +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif + +#ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +#endif + +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif + +static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +{ + char message[1024] = {}; + + va_list args; + + if (logfd < 0 || level == NULL) + return; + + va_start(args, format); + if (vsnprintf(message, sizeof(message), format, args) < 0) + goto done; + + dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); +done: + va_end(args); +} + +#define write_log(level, fmt, ...) \ + write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + +/* XXX: This is ugly. */ +static int syncfd = -1; + +#define bail(fmt, ...) \ + do { \ + write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ + exit(1); \ + } while(0) + +static int write_file(char *data, size_t data_len, char *pathfmt, ...) +{ + int fd, len, ret = 0; + char path[PATH_MAX]; + + va_list ap; + va_start(ap, pathfmt); + len = vsnprintf(path, PATH_MAX, pathfmt, ap); + va_end(ap); + if (len < 0) + return -1; + + fd = open(path, O_RDWR); + if (fd < 0) { + return -1; + } + + len = write(fd, data, data_len); + if (len != data_len) { + ret = -1; + goto out; + } + + out: + close(fd); + return ret; +} + +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; + +/* This *must* be called before we touch gid_map. */ +static void update_setgroups(int pid, enum policy_t setgroup) +{ + char *policy; + + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + default: + /* Nothing to do. */ + return; + } + + if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { + /* + * If the kernel is too old to support /proc/pid/setgroups, + * open(2) or write(2) will return ENOENT. This is fine. + */ + if (errno != ENOENT) + bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); + } +} + +static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) +{ + int child; + + /* + * If @app is NULL, execve will segfault. Just check it here and bail (if + * we're in this path, the caller is already getting desperate and there + * isn't a backup to this failing). This usually would be a configuration + * or programming issue. + */ + if (!app) + bail("mapping tool not present"); + + child = fork(); + if (child < 0) + bail("failed to fork"); + + if (!child) { +#define MAX_ARGV 20 + char *argv[MAX_ARGV]; + char *envp[] = { NULL }; + char pid_fmt[16]; + int argc = 0; + char *next; + + snprintf(pid_fmt, 16, "%d", pid); + + argv[argc++] = (char *)app; + argv[argc++] = pid_fmt; + /* + * Convert the map string into a list of argument that + * newuidmap/newgidmap can understand. + */ + + while (argc < MAX_ARGV) { + if (*map == '\0') { + argv[argc++] = NULL; + break; + } + argv[argc++] = map; + next = strpbrk(map, "\n "); + if (next == NULL) + break; + *next++ = '\0'; + map = next + strspn(next, "\n "); + } + + execve(app, argv, envp); + bail("failed to execv"); + } else { + int status; + + while (true) { + if (waitpid(child, &status, 0) < 0) { + if (errno == EINTR) + continue; + bail("failed to waitpid"); + } + if (WIFEXITED(status) || WIFSIGNALED(status)) + return WEXITSTATUS(status); + } + } + + return -1; +} + +static void update_uidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/uid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newuid map on %d", pid); + } +} + +static void update_gidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/gid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newgid map on %d", pid); + } +} + +static void update_oom_score_adj(char *data, size_t len) +{ + if (data == NULL || len <= 0) + return; + + if (write_file(data, len, "/proc/self/oom_score_adj") < 0) + bail("failed to update /proc/self/oom_score_adj"); +} + +/* A dummy function that just jumps to the given jumpval. */ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} + +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; + + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. + */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +static void setup_logpipe(void) +{ + char *logpipe, *endptr; + + logpipe = getenv("_LIBCONTAINER_LOGPIPE"); + if (logpipe == NULL || *logpipe == '\0') { + return; + } + + logfd = strtol(logpipe, &endptr, 10); + if (logpipe == endptr || *endptr != '\0') { + fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe); + /* It is too early to use bail */ + exit(1); + } +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. */ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void nl_parse(int fd, struct nlconfig_t *config) +{ + size_t len, size; + struct nlmsghdr hdr; + char *data, *current; + + /* Retrieve the netlink header. */ + len = read(fd, &hdr, NLMSG_HDRLEN); + if (len != NLMSG_HDRLEN) + bail("invalid netlink header length %zu", len); + + if (hdr.nlmsg_type == NLMSG_ERROR) + bail("failed to read netlink message"); + + if (hdr.nlmsg_type != INIT_MSG) + bail("unexpected msg type %d", hdr.nlmsg_type); + + /* Retrieve data. */ + size = NLMSG_PAYLOAD(&hdr, 0); + current = data = malloc(size); + if (!data) + bail("failed to allocate %zu bytes of memory for nl_payload", size); + + len = read(fd, data, size); + if (len != size) + bail("failed to read netlink payload, %zu != %zu", len, size); + + /* Parse the netlink payload. */ + config->data = data; + while (current < data + size) { + struct nlattr *nlattr = (struct nlattr *)current; + size_t payload_len = nlattr->nla_len - NLA_HDRLEN; + + /* Advance to payload. */ + current += NLA_HDRLEN; + + /* Handle payload. */ + switch (nlattr->nla_type) { + case CLONE_FLAGS_ATTR: + config->cloneflags = readint32(current); + break; + case ROOTLESS_EUID_ATTR: + config->is_rootless_euid = readint8(current); /* boolean */ + break; + case OOM_SCORE_ADJ_ATTR: + config->oom_score_adj = current; + config->oom_score_adj_len = payload_len; + break; + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; + case UIDMAP_ATTR: + config->uidmap = current; + config->uidmap_len = payload_len; + break; + case GIDMAP_ATTR: + config->gidmap = current; + config->gidmap_len = payload_len; + break; + case UIDMAPPATH_ATTR: + config->uidmappath = current; + config->uidmappath_len = payload_len; + break; + case GIDMAPPATH_ATTR: + config->gidmappath = current; + config->gidmappath_len = payload_len; + break; + case SETGROUP_ATTR: + config->is_setgroup = readint8(current); + break; + default: + bail("unknown netlink message type %d", nlattr->nla_type); + } + + current += NLA_ALIGN(payload_len); + } +} + +void nl_free(struct nlconfig_t *config) +{ + free(config->data); +} + +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", path); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX - 1); + ns->path[PATH_MAX - 1] = '\0'; + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + +/* Defined in cloned_binary.c. */ +extern int ensure_cloned_binary(void); + +void nsexec(void) +{ + int pipenum; + jmp_buf env; + int sync_child_pipe[2], sync_grandchild_pipe[2]; + struct nlconfig_t config = { 0 }; + + /* + * Setup a pipe to send logs to the parent. This should happen + * first, because bail will use that pipe. + */ + setup_logpipe(); + + /* + * If we don't have an init pipe, just return to the go routine. + * We'll only get an init pipe for start or exec. + */ + pipenum = initpipe(); + if (pipenum == -1) + return; + + /* + * We need to re-exec if we are not in a cloned binary. This is necessary + * to ensure that containers won't be able to access the host binary + * through /proc/self/exe. See CVE-2019-5736. + */ + if (ensure_cloned_binary() < 0) + bail("could not ensure we are a cloned binary"); + + write_log(DEBUG, "nsexec started"); + + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + + /* Set oom_score_adj. This has to be done before !dumpable because + * /proc/self/oom_score_adj is not writeable unless you're an privileged + * user (if !dumpable is set). All children inherit their parent's + * oom_score_adj value on fork(2) so this will always be propagated + * properly. + */ + update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); + + /* + * Make the process non-dumpable, to avoid various race conditions that + * could cause processes in namespaces we're joining to access host + * resources (or potentially execute code). + * + * However, if the number of namespaces we are joining is 0, we are not + * going to be switching to a different security context. Thus setting + * ourselves to be non-dumpable only breaks things (like rootless + * containers), which is the recommendation from the kernel folks. + */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as non-dumpable"); + } + + /* Pipe so we can tell the child when we've finished setting up. */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) + bail("failed to setup sync pipe between parent and child"); + + /* + * We need a new socketpair to sync with grandchild so we don't have + * race condition with child. + */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) + bail("failed to setup sync pipe between parent and grandchild"); + + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order for this unsharing code to be more extensible we need to split + * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case + * would be if we did clone(CLONE_NEWUSER) and the other namespaces + * separately, but because of SELinux issues we cannot really do that. But + * we cannot just dump the namespace flags into clone(...) because several + * usecases (such as rootless containers) require more granularity around + * the namespace setup. In addition, some older kernels had issues where + * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot + * handle this while also dealing with SELinux so we choose SELinux support + * over broken kernel support). + * + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT:{ + int len; + pid_t child, first_child = -1; + bool ready = false; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); + + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); + + /* + * State machine for synchronisation with the children. + * + * Father only return when both child and grandchild are + * ready, so we can receive all possible error codes + * generated by children. + */ + while (!ready) { + enum sync_t s; + + syncfd = sync_child_pipe[1]; + close(sync_child_pipe[0]); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_USERMAP_PLS: + /* + * Enable setgroups(2) if we've been asked to. But we also + * have to explicitly disable setgroups(2) if we're + * creating a rootless container for single-entry mapping. + * i.e. config.is_setgroup == false. + * (this is required since Linux 3.19). + * + * For rootless multi-entry mapping, config.is_setgroup shall be true and + * newuidmap/newgidmap shall be used. + */ + + if (config.is_rootless_euid && !config.is_setgroup) + update_setgroups(child, SETGROUPS_DENY); + + /* Set up mappings. */ + update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_RECVPID_PLS:{ + first_child = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(first_child, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(first_child, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + + /* Send the init_func pid back to our parent. + * + * Send the init_func pid and the pid of the first child back to our parent. + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + } + break; + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + + /* Now sync with grandchild. */ + + ready = false; + while (!ready) { + enum sync_t s; + + syncfd = sync_grandchild_pipe[1]; + close(sync_grandchild_pipe[0]); + + s = SYNC_GRANDCHILD; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_GRANDCHILD)"); + } + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD:{ + pid_t child; + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = sync_child_pipe[0]; + close(sync_child_pipe[1]); + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + * + * We don't unshare all namespaces in one go. The reason for this + * is that, while the kernel documentation may claim otherwise, + * there are certain cases where unsharing all namespaces at once + * will result in namespace objects being owned incorrectly. + * Ideally we should just fix these kernel bugs, but it's better to + * be safe than sorry, and fix them separately. + * + * A specific case of this is that the SELinux label of the + * internal kern-mount that mqueue uses will be incorrect if the + * UTS namespace is cloned before the USER namespace is mapped. + * I've also heard of similar problems with the network namespace + * in some scenarios. This also mirrors how LXC deals with this + * problem. + */ + if (config.cloneflags & CLONE_NEWUSER) { + if (unshare(CLONE_NEWUSER) < 0) + bail("failed to unshare user namespace"); + config.cloneflags &= ~CLONE_NEWUSER; + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal our parent to hook us up. + */ + + /* Switching is only necessary if we joined namespaces. */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) + bail("failed to set process as dumpable"); + } + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + /* Switching is only necessary if we joined namespaces. */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as dumpable"); + } + + /* Become root in the namespace proper. */ + if (setresuid(0, 0, 0) < 0) + bail("failed to become root in user namespace"); + } + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) + bail("failed to unshare namespaces"); + + /* + * TODO: What about non-namespace clone flags that we're dropping here? + * + * We fork again because of PID namespace, setns(2) or unshare(2) don't + * change the PID namespace of the calling process, because doing so + * would change the caller's idea of its own PID (as reported by getpid()), + * which would break many applications and libraries, so we must fork + * to actually enter the new PID namespace. + */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT:{ + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = sync_grandchild_pipe[0]; + close(sync_grandchild_pipe[1]); + close(sync_child_pipe[0]); + close(sync_child_pipe[1]); + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); + if (s != SYNC_GRANDCHILD) + bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (!config.is_rootless_euid && config.is_setgroup) { + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + } + + /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + if (config.cloneflags & CLONE_NEWCGROUP) { + uint8_t value; + if (read(pipenum, &value, sizeof(value)) != sizeof(value)) + bail("read synchronisation value failed"); + if (value == CREATECGROUPNS) { + if (unshare(CLONE_NEWCGROUP) < 0) + bail("failed to unshare cgroup namespace"); + } else + bail("received unknown synchronisation value"); + } + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + + /* Close sync pipes. */ + close(sync_grandchild_pipe[0]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + } + + /* Should never be reached. */ + bail("should never be reached"); +} diff --git a/libcontainer/process.go b/libcontainer/process.go new file mode 100644 index 0000000..d3e472a --- /dev/null +++ b/libcontainer/process.go @@ -0,0 +1,115 @@ +package libcontainer + +import ( + "fmt" + "io" + "math" + "os" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type processOperations interface { + wait() (*os.ProcessState, error) + signal(sig os.Signal) error + pid() int +} + +// Process specifies the configuration and IO for a process inside +// a container. +type Process struct { + // The command to be run followed by any arguments. + Args []string + + // Env specifies the environment variables for the process. + Env []string + + // User will set the uid and gid of the executing process running inside the container + // local to the container's user and group configuration. + User string + + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. + AdditionalGroups []string + + // Cwd will change the processes current working directory inside the container's rootfs. + Cwd string + + // Stdin is a pointer to a reader which provides the standard input stream. + Stdin io.Reader + + // Stdout is a pointer to a writer which receives the standard output stream. + Stdout io.Writer + + // Stderr is a pointer to a writer which receives the standard error stream. + Stderr io.Writer + + // ExtraFiles specifies additional open files to be inherited by the container + ExtraFiles []*os.File + + // Initial sizings for the console + ConsoleWidth uint16 + ConsoleHeight uint16 + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *configs.Capabilities + + // AppArmorProfile specifies the profile to apply to the process and is + // changed at the time the process is execed + AppArmorProfile string + + // Label specifies the label to apply to the process. It is commonly used by selinux + Label string + + // NoNewPrivileges controls whether processes can gain additional privileges. + NoNewPrivileges *bool + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []configs.Rlimit + + // ConsoleSocket provides the masterfd console. + ConsoleSocket *os.File + + // Init specifies whether the process is the first process in the container. + Init bool + + ops processOperations + + LogLevel string +} + +// Wait waits for the process to exit. +// Wait releases any resources associated with the Process +func (p Process) Wait() (*os.ProcessState, error) { + if p.ops == nil { + return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.wait() +} + +// Pid returns the process ID +func (p Process) Pid() (int, error) { + // math.MinInt32 is returned here, because it's invalid value + // for the kill() system call. + if p.ops == nil { + return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.pid(), nil +} + +// Signal sends a signal to the Process. +func (p Process) Signal(sig os.Signal) error { + if p.ops == nil { + return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.signal(sig) +} + +// IO holds the process's STDIO +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go new file mode 100644 index 0000000..de989b5 --- /dev/null +++ b/libcontainer/process_linux.go @@ -0,0 +1,598 @@ +// +build linux + +package libcontainer + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" // only for Signal + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/logs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +// Synchronisation value for cgroup namespace setup. +// The same constant is defined in nsexec.c as "CREATECGROUPNS". +const createCgroupns = 0x80 + +type parentProcess interface { + // pid returns the pid for the running process. + pid() int + + // start starts the process execution. + start() error + + // send a SIGKILL to the process and wait for the exit. + terminate() error + + // wait waits on the process returning the process state. + wait() (*os.ProcessState, error) + + // startTime returns the process start time. + startTime() (uint64, error) + + signal(os.Signal) error + + externalDescriptors() []string + + setExternalDescriptors(fds []string) + + forwardChildLogs() +} + +type filePair struct { + parent *os.File + child *os.File +} + +type setnsProcess struct { + cmd *exec.Cmd + messageSockPair filePair + logFilePair filePair + cgroupPaths map[string]string + rootlessCgroups bool + intelRdtPath string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader +} + +func (p *setnsProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *setnsProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *setnsProcess) start() (err error) { + defer p.messageSockPair.parent.Close() + err = p.cmd.Start() + // close the write-side of the pipes (controlled by child) + p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { + return newSystemErrorWithCause(err, "starting setns process") + } + if p.bootstrapData != nil { + if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + } + if err = p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "executing setns process") + } + if len(p.cgroupPaths) > 0 { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) + } + } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for process") + } + if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil { + return newSystemErrorWithCause(err, "writing config to pipe") + } + + ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + switch sync.Type { + case procReady: + // This shouldn't happen. + panic("unexpected procReady in setns") + case procHooks: + // This shouldn't happen. + panic("unexpected procHooks in setns") + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + }) + + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "calling shutdown on init pipe") + } + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +func (p *setnsProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "waiting on setns process to finish") + } + if !status.Success() { + p.cmd.Wait() + return newSystemError(&exec.ExitError{ProcessState: status}) + } + var pid *pid + if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "reading pid from init pipe") + } + + // Clean up the zombie parent process + // On Unix systems FindProcess always succeeds. + firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +// terminate sends a SIGKILL to the forked process for the setns routine then waits to +// avoid the process becoming a zombie. +func (p *setnsProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *setnsProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + + // Return actual ProcessState even on Wait error + return p.cmd.ProcessState, err +} + +func (p *setnsProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *setnsProcess) externalDescriptors() []string { + return p.fds +} + +func (p *setnsProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *setnsProcess) forwardChildLogs() { + go logs.ForwardLogs(p.logFilePair.parent) +} + +type initProcess struct { + cmd *exec.Cmd + messageSockPair filePair + logFilePair filePair + config *initConfig + manager cgroups.Manager + intelRdtManager intelrdt.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool +} + +func (p *initProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *initProcess) externalDescriptors() []string { + return p.fds +} + +// getChildPid receives the final child's pid over the provided pipe. +func (p *initProcess) getChildPid() (int, error) { + var pid pid + if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + p.cmd.Wait() + return -1, err + } + + // Clean up the zombie parent process + // On Unix systems FindProcess always succeeds. + firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + + return pid.Pid, nil +} + +func (p *initProcess) waitForChildExit(childPid int) error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + + process, err := os.FindProcess(childPid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +func (p *initProcess) start() error { + defer p.messageSockPair.parent.Close() + err := p.cmd.Start() + p.process.ops = p + // close the write-side of the pipes (controlled by child) + p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { + p.process.ops = nil + return newSystemErrorWithCause(err, "starting init process command") + } + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + if p.intelRdtManager != nil { + p.intelRdtManager.Destroy() + } + } + }() + + if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + childPid, err := p.getChildPid() + if err != nil { + return newSystemErrorWithCause(err, "getting the final child's pid from pipe") + } + + // Save the standard descriptor names before the container process + // can potentially move them (e.g., via dup2()). If we don't do this now, + // we won't know at checkpoint time which file descriptor to look up. + fds, err := getPipeFds(childPid) + if err != nil { + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid) + } + p.setExternalDescriptors(fds) + // Do this before syncing with child so that no children + // can escape the cgroup + if err := p.manager.Apply(childPid); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(childPid); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + // Now it's time to setup cgroup namesapce + if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" { + if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil { + return newSystemErrorWithCause(err, "sending synchronization value to init process") + } + } + + // Wait for our first child to exit + if err := p.waitForChildExit(childPid); err != nil { + return newSystemErrorWithCause(err, "waiting for our first child to exit") + } + + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + if p.intelRdtManager != nil { + p.intelRdtManager.Destroy() + } + } + }() + if err := p.createNetworkInterfaces(); err != nil { + return newSystemErrorWithCause(err, "creating network interfaces") + } + if err := p.sendConfig(); err != nil { + return newSystemErrorWithCause(err, "sending config to init process") + } + var ( + sentRun bool + sentResume bool + ) + + ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + switch sync.Type { + case procReady: + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for ready process") + } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for ready process") + } + } + + if p.config.Config.Hooks != nil { + s, err := p.container.currentOCIState() + if err != nil { + return err + } + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = "creating" + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + } + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procRun); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'run'") + } + sentRun = true + case procHooks: + // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process") + } + } + if p.config.Config.Hooks != nil { + s, err := p.container.currentOCIState() + if err != nil { + return err + } + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = "creating" + for i, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemErrorWithCausef(err, "running prestart hook %d", i) + } + } + } + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procResume); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'resume'") + } + sentResume = true + default: + return newSystemError(fmt.Errorf("invalid JSON payload from child")) + } + + return nil + }) + + if !sentRun { + return newSystemErrorWithCause(ierr, "container init") + } + if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { + return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) + } + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "shutting down init pipe") + } + + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +func (p *initProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + if err != nil { + return p.cmd.ProcessState, err + } + // we should kill all processes in cgroup when init is died if we use host PID namespace + if p.sharePidns { + signalAllProcesses(p.manager, unix.SIGKILL) + } + return p.cmd.ProcessState, nil +} + +func (p *initProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *initProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *initProcess) sendConfig() error { + // send the config to the container's init process, we don't use JSON Encode + // here because there might be a problem in JSON decoder in some cases, see: + // https://github.com/docker/docker/issues/14203#issuecomment-174177790 + return utils.WriteJSON(p.messageSockPair.parent, p.config) +} + +func (p *initProcess) createNetworkInterfaces() error { + for _, config := range p.config.Config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + n := &network{ + Network: *config, + } + if err := strategy.create(n, p.pid()); err != nil { + return err + } + p.config.Networks = append(p.config.Networks, n) + } + return nil +} + +func (p *initProcess) signal(sig os.Signal) error { + s, ok := sig.(syscall.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *initProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *initProcess) forwardChildLogs() { + go logs.ForwardLogs(p.logFilePair.parent) +} + +func getPipeFds(pid int) ([]string, error) { + fds := make([]string, 3) + + dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") + for i := 0; i < 3; i++ { + // XXX: This breaks if the path is not a valid symlink (which can + // happen in certain particularly unlucky mount namespace setups). + f := filepath.Join(dirPath, strconv.Itoa(i)) + target, err := os.Readlink(f) + if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. if we can't get the fd for a particular + // file, there's not much we can do. + if os.IsPermission(err) { + continue + } + return fds, err + } + fds[i] = target + } + return fds, nil +} + +// InitializeIO creates pipes for use with the process's stdio and returns the +// opposite side for each. Do not use this if you want to have a pseudoterminal +// set up for you by libcontainer (TODO: fix that too). +// TODO: This is mostly unnecessary, and should be handled by clients. +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + unix.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stderr, i.Stderr = w, r + // change ownership of the pipes in case we are in a user namespace + for _, fd := range fds { + if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { + return nil, err + } + } + return i, nil +} diff --git a/libcontainer/restored_process.go b/libcontainer/restored_process.go new file mode 100644 index 0000000..28d52ad --- /dev/null +++ b/libcontainer/restored_process.go @@ -0,0 +1,128 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/system" +) + +func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) { + var ( + err error + ) + proc, err := os.FindProcess(pid) + if err != nil { + return nil, err + } + stat, err := system.Stat(pid) + if err != nil { + return nil, err + } + return &restoredProcess{ + proc: proc, + processStartTime: stat.StartTime, + fds: fds, + }, nil +} + +type restoredProcess struct { + proc *os.Process + processStartTime uint64 + fds []string +} + +func (p *restoredProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *restoredProcess) pid() int { + return p.proc.Pid +} + +func (p *restoredProcess) terminate() error { + err := p.proc.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *restoredProcess) wait() (*os.ProcessState, error) { + // TODO: how do we wait on the actual process? + // maybe use --exec-cmd in criu + st, err := p.proc.Wait() + if err != nil { + return nil, err + } + return st, nil +} + +func (p *restoredProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *restoredProcess) signal(s os.Signal) error { + return p.proc.Signal(s) +} + +func (p *restoredProcess) externalDescriptors() []string { + return p.fds +} + +func (p *restoredProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *restoredProcess) forwardChildLogs() { +} + +// nonChildProcess represents a process where the calling process is not +// the parent process. This process is created when a factory loads a container from +// a persisted state. +type nonChildProcess struct { + processPid int + processStartTime uint64 + fds []string +} + +func (p *nonChildProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *nonChildProcess) pid() int { + return p.processPid +} + +func (p *nonChildProcess) terminate() error { + return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError) +} + +func (p *nonChildProcess) wait() (*os.ProcessState, error) { + return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError) +} + +func (p *nonChildProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *nonChildProcess) signal(s os.Signal) error { + proc, err := os.FindProcess(p.processPid) + if err != nil { + return err + } + return proc.Signal(s) +} + +func (p *nonChildProcess) externalDescriptors() []string { + return p.fds +} + +func (p *nonChildProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *nonChildProcess) forwardChildLogs() { +} diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go new file mode 100644 index 0000000..106c4c2 --- /dev/null +++ b/libcontainer/rootfs_linux.go @@ -0,0 +1,1009 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "time" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/mrunalp/fileutils" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/mount" + "github.com/opencontainers/runc/libcontainer/system" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux/label" + + "golang.org/x/sys/unix" +) + +const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV + +// needsSetupDev returns true if /dev needs to be set up. +func needsSetupDev(config *configs.Config) bool { + for _, m := range config.Mounts { + if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { + return false + } + } + return true +} + +// prepareRootfs sets up the devices, mount points, and filesystems for use +// inside a new mount namespace. It doesn't set anything as ro. You must call +// finalizeRootfs after this function to finish setting up the rootfs. +func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { + config := iConfig.Config + if err := prepareRoot(config); err != nil { + return newSystemErrorWithCause(err, "preparing rootfs") + } + + hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP) + setupDev := needsSetupDev(config) + for _, m := range config.Mounts { + for _, precmd := range m.PremountCmds { + if err := mountCmd(precmd); err != nil { + return newSystemErrorWithCause(err, "running premount command") + } + } + if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil { + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) + } + + for _, postcmd := range m.PostmountCmds { + if err := mountCmd(postcmd); err != nil { + return newSystemErrorWithCause(err, "running postmount command") + } + } + } + + if setupDev { + if err := createDevices(config); err != nil { + return newSystemErrorWithCause(err, "creating device nodes") + } + if err := setupPtmx(config); err != nil { + return newSystemErrorWithCause(err, "setting up ptmx") + } + if err := setupDevSymlinks(config.Rootfs); err != nil { + return newSystemErrorWithCause(err, "setting up /dev symlinks") + } + } + + // Signal the parent to run the pre-start hooks. + // The hooks are run after the mounts are setup, but before we switch to the new + // root, so that the old root is still available in the hooks for any mount + // manipulations. + // Note that iConfig.Cwd is not guaranteed to exist here. + if err := syncParentHooks(pipe); err != nil { + return err + } + + // The reason these operations are done here rather than in finalizeRootfs + // is because the console-handling code gets quite sticky if we have to set + // up the console before doing the pivot_root(2). This is because the + // Console API has to also work with the ExecIn case, which means that the + // API must be able to deal with being inside as well as outside the + // container. It's just cleaner to do this here (at the expense of the + // operation not being perfectly split). + + if err := unix.Chdir(config.Rootfs); err != nil { + return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) + } + + if config.NoPivotRoot { + err = msMoveRoot(config.Rootfs) + } else if config.Namespaces.Contains(configs.NEWNS) { + err = pivotRoot(config.Rootfs) + } else { + err = chroot(config.Rootfs) + } + if err != nil { + return newSystemErrorWithCause(err, "jailing process inside rootfs") + } + + if setupDev { + if err := reOpenDevNull(); err != nil { + return newSystemErrorWithCause(err, "reopening /dev/null inside container") + } + } + + if cwd := iConfig.Cwd; cwd != "" { + // Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...". + // However, we are safe to call MkDirAll directly because we are in the jail here. + if err := os.MkdirAll(cwd, 0755); err != nil { + return err + } + } + + return nil +} + +// finalizeRootfs sets anything to ro if necessary. You must call +// prepareRootfs first. +func finalizeRootfs(config *configs.Config) (err error) { + // remount dev as ro if specified + for _, m := range config.Mounts { + if libcontainerUtils.CleanPath(m.Destination) == "/dev" { + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + if err := remountReadonly(m); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) + } + } + break + } + } + + // set rootfs ( / ) as readonly + if config.Readonlyfs { + if err := setReadonly(); err != nil { + return newSystemErrorWithCause(err, "setting rootfs as readonly") + } + } + + unix.Umask(0022) + return nil +} + +// /tmp has to be mounted as private to allow MS_MOVE to work in all situations +func prepareTmp(topTmpDir string) (string, error) { + tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") + if err != nil { + return "", err + } + if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { + return "", err + } + if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { + return "", err + } + return tmpdir, nil +} + +func cleanupTmp(tmpdir string) error { + unix.Unmount(tmpdir, 0) + return os.RemoveAll(tmpdir) +} + +func mountCmd(cmd configs.Command) error { + command := exec.Command(cmd.Path, cmd.Args[:]...) + command.Env = cmd.Env + command.Dir = cmd.Dir + if out, err := command.CombinedOutput(); err != nil { + return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) + } + return nil +} + +func prepareBindMount(m *configs.Mount, rootfs string) error { + stat, err := os.Stat(m.Source) + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. + return err + } + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + var dest string + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { + return err + } + if err := checkProcMount(rootfs, dest, m.Source); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := createIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + + return nil +} + +func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + var merged []string + for _, b := range binds { + ss := filepath.Base(b.Destination) + if strings.Contains(ss, ",") { + merged = append(merged, ss) + } + } + tmpfs := &configs.Mount{ + Source: "tmpfs", + Device: "tmpfs", + Destination: m.Destination, + Flags: defaultMountFlags, + Data: "mode=755", + PropagationFlags: m.PropagationFlags, + } + if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + for _, b := range binds { + if enableCgroupns { + subsystemPath := filepath.Join(rootfs, b.Destination) + if err := os.MkdirAll(subsystemPath, 0755); err != nil { + return err + } + flags := defaultMountFlags + if m.Flags&unix.MS_RDONLY != 0 { + flags = flags | unix.MS_RDONLY + } + cgroupmount := &configs.Mount{ + Source: "cgroup", + Device: "cgroup", + Destination: subsystemPath, + Flags: flags, + Data: filepath.Base(subsystemPath), + } + if err := mountNewCgroup(cgroupmount); err != nil { + return err + } + } else { + if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + } + } + for _, mc := range merged { + for _, ss := range strings.Split(mc, ",") { + // symlink(2) is very dumb, it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { + return err + } + } + } + return nil +} + +func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { + cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination) + if err != nil { + return err + } + if err := os.MkdirAll(cgroupPath, 0755); err != nil { + return err + } + if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil { + // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) + if err == unix.EPERM || err == unix.EBUSY { + return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "") + } + return err + } + return nil +} + +func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + + switch m.Device { + case "proc", "sysfs": + // If the destination already exists and is not a directory, we bail + // out This is to avoid mounting through a symlink or similar -- which + // has been a "fun" attack scenario in the past. + // TODO: This won't be necessary once we switch to libpathrs and we can + // stop all of these symlink-exchange attacks. + if fi, err := os.Lstat(dest); err != nil { + if !os.IsNotExist(err) { + return err + } + } else if fi.Mode()&os.ModeDir == 0 { + return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) + } + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + // Selinux kernels do not support labeling of /proc or /sys + return mountPropagate(m, rootfs, "") + case "mqueue": + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + // older kernels do not support labeling of /dev/mqueue + if err := mountPropagate(m, rootfs, ""); err != nil { + return err + } + return label.SetFileLabel(dest, mountLabel) + } + return nil + case "tmpfs": + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + tmpDir := "" + stat, err := os.Stat(dest) + if err != nil { + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + } + if copyUp { + tmpdir, err := prepareTmp("/tmp") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") + } + defer cleanupTmp(tmpdir) + tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") + } + defer os.RemoveAll(tmpDir) + m.Destination = tmpDir + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + if copyUp { + if err := fileutils.CopyDirectory(dest, tmpDir); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err) + if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil { + errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err) + if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { + return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) + } + return errMsg + } + } + if stat != nil { + if err = os.Chmod(dest, stat.Mode()); err != nil { + return err + } + } + return nil + case "bind": + if err := prepareBindMount(m, rootfs); err != nil { + return err + } + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } + // bind mount won't change mount options, we need remount to make mount options effective. + // first check that we have non-default options required before attempting a remount + if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { + // only remount if unique mount options are set + if err := remount(m, rootfs); err != nil { + return err + } + } + + if m.Relabel != "" { + if err := label.Validate(m.Relabel); err != nil { + return err + } + shared := label.IsShared(m.Relabel) + if err := label.Relabel(m.Source, mountLabel, shared); err != nil { + return err + } + } + case "cgroup": + if cgroups.IsCgroup2UnifiedMode() { + if err := mountCgroupV2(m, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + } else { + + if err := mountCgroupV1(m, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } + } + if m.Flags&unix.MS_RDONLY != 0 { + // remount cgroup root as readonly + mcgrouproot := &configs.Mount{ + Source: m.Destination, + Device: "bind", + Destination: m.Destination, + Flags: defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND, + } + if err := remount(mcgrouproot, rootfs); err != nil { + return err + } + } + default: + // ensure that the destination of the mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + var err error + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { + return err + } + if err := checkProcMount(rootfs, dest, m.Source); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + return mountPropagate(m, rootfs, mountLabel) + } + return nil +} + +func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { + mounts, err := cgroups.GetCgroupMounts(false) + if err != nil { + return nil, err + } + + cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + var binds []*configs.Mount + + for _, mm := range mounts { + dir, err := mm.GetOwnCgroup(cgroupPaths) + if err != nil { + return nil, err + } + relDir, err := filepath.Rel(mm.Root, dir) + if err != nil { + return nil, err + } + binds = append(binds, &configs.Mount{ + Device: "bind", + Source: filepath.Join(mm.Mountpoint, relDir), + Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)), + Flags: unix.MS_BIND | unix.MS_REC | m.Flags, + PropagationFlags: m.PropagationFlags, + }) + } + + return binds, nil +} + +// checkProcMount checks to ensure that the mount destination is not over the top of /proc. +// dest is required to be an abs path and have any symlinks resolved before calling this function. +// +// if source is nil, don't stat the filesystem. This is used for restore of a checkpoint. +func checkProcMount(rootfs, dest, source string) error { + const procPath = "/proc" + // White list, it should be sub directories of invalid destinations + validDestinations := []string{ + // These entries can be bind mounted by files emulated by fuse, + // so commands like top, free displays stats in container. + "/proc/cpuinfo", + "/proc/diskstats", + "/proc/meminfo", + "/proc/stat", + "/proc/swaps", + "/proc/uptime", + "/proc/loadavg", + "/proc/net/dev", + } + for _, valid := range validDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) + if err != nil { + return err + } + if path == "." { + return nil + } + } + path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) + if err != nil { + return err + } + // pass if the mount path is located outside of /proc + if strings.HasPrefix(path, "..") { + return nil + } + if path == "." { + // an empty source is pasted on restore + if source == "" { + return nil + } + // only allow a mount on-top of proc if it's source is "proc" + isproc, err := isProc(source) + if err != nil { + return err + } + // pass if the mount is happening on top of /proc and the source of + // the mount is a proc filesystem + if isproc { + return nil + } + return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) + } + return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) +} + +func isProc(path string) (bool, error) { + var s unix.Statfs_t + if err := unix.Statfs(path, &s); err != nil { + return false, err + } + return s.Type == unix.PROC_SUPER_MAGIC, nil +} + +func setupDevSymlinks(rootfs string) error { + var links = [][2]string{ + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } + // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink + // in /dev if it exists in /proc. + if _, err := os.Stat("/proc/kcore"); err == nil { + links = append(links, [2]string{"/proc/kcore", "/dev/core"}) + } + for _, link := range links { + var ( + src = link[0] + dst = filepath.Join(rootfs, link[1]) + ) + if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink %s %s %s", src, dst, err) + } + } + return nil +} + +// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs +// this method will make them point to `/dev/null` in this container's rootfs. This +// needs to be called after we chroot/pivot into the container's rootfs so that any +// symlinks are resolved locally. +func reOpenDevNull() error { + var stat, devNullStat unix.Stat_t + file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) + if err != nil { + return fmt.Errorf("Failed to open /dev/null - %s", err) + } + defer file.Close() + if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { + return err + } + for fd := 0; fd < 3; fd++ { + if err := unix.Fstat(fd, &stat); err != nil { + return err + } + if stat.Rdev == devNullStat.Rdev { + // Close and re-open the fd. + if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil { + return err + } + } + } + return nil +} + +// Create the device nodes in the container. +func createDevices(config *configs.Config) error { + useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) + oldMask := unix.Umask(0000) + for _, node := range config.Devices { + // containers running in a user namespace are not allowed to mknod + // devices so we can just bind mount it from the host. + if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { + unix.Umask(oldMask) + return err + } + } + unix.Umask(oldMask) + return nil +} + +func bindMountDeviceNode(dest string, node *configs.Device) error { + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "") +} + +// Creates the device node in the rootfs of the container. +func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { + dest := filepath.Join(rootfs, node.Path) + if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { + return err + } + + if bind { + return bindMountDeviceNode(dest, node) + } + if err := mknodDevice(dest, node); err != nil { + if os.IsExist(err) { + return nil + } else if os.IsPermission(err) { + return bindMountDeviceNode(dest, node) + } + return err + } + return nil +} + +func mknodDevice(dest string, node *configs.Device) error { + fileMode := node.FileMode + switch node.Type { + case 'c', 'u': + fileMode |= unix.S_IFCHR + case 'b': + fileMode |= unix.S_IFBLK + case 'p': + fileMode |= unix.S_IFIFO + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil { + return err + } + return unix.Chown(dest, int(node.Uid), int(node.Gid)) +} + +func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { + for _, m := range mountinfo { + if m.Mountpoint == dir { + return m + } + } + return nil +} + +// Get the parent mount point of directory passed in as argument. Also return +// optional fields. +func getParentMount(rootfs string) (string, string, error) { + var path string + + mountinfos, err := mount.GetMounts() + if err != nil { + return "", "", err + } + + mountinfo := getMountInfo(mountinfos, rootfs) + if mountinfo != nil { + return rootfs, mountinfo.Optional, nil + } + + path = rootfs + for { + path = filepath.Dir(path) + + mountinfo = getMountInfo(mountinfos, path) + if mountinfo != nil { + return path, mountinfo.Optional, nil + } + + if path == "/" { + break + } + } + + // If we are here, we did not find parent mount. Something is wrong. + return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs) +} + +// Make parent mount private if it was shared +func rootfsParentMountPrivate(rootfs string) error { + sharedMount := false + + parentMount, optionalOpts, err := getParentMount(rootfs) + if err != nil { + return err + } + + optsSplit := strings.Split(optionalOpts, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + sharedMount = true + break + } + } + + // Make parent mount PRIVATE if it was shared. It is needed for two + // reasons. First of all pivot_root() will fail if parent mount is + // shared. Secondly when we bind mount rootfs it will propagate to + // parent namespace and we don't want that to happen. + if sharedMount { + return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "") + } + + return nil +} + +func prepareRoot(config *configs.Config) error { + flag := unix.MS_SLAVE | unix.MS_REC + if config.RootPropagation != 0 { + flag = config.RootPropagation + } + if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil { + return err + } + + // Make parent mount private to make sure following bind mount does + // not propagate in other namespaces. Also it will help with kernel + // check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent)) + if err := rootfsParentMountPrivate(config.Rootfs); err != nil { + return err + } + + return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "") +} + +func setReadonly() error { + return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") +} + +func setupPtmx(config *configs.Config) error { + ptmx := filepath.Join(config.Rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + return nil +} + +// pivotRoot will call pivot_root such that rootfs becomes the new root +// filesystem, and everything else is cleaned up. +func pivotRoot(rootfs string) error { + // While the documentation may claim otherwise, pivot_root(".", ".") is + // actually valid. What this results in is / being the new root but + // /proc/self/cwd being the old root. Since we can play around with the cwd + // with pivot_root this allows us to pivot without creating directories in + // the rootfs. Shout-outs to the LXC developers for giving us this idea. + + oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(oldroot) + + newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(newroot) + + // Change to the new root so that the pivot_root actually acts on it. + if err := unix.Fchdir(newroot); err != nil { + return err + } + + if err := unix.PivotRoot(".", "."); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + + // Currently our "." is oldroot (according to the current kernel code). + // However, purely for safety, we will fchdir(oldroot) since there isn't + // really any guarantee from the kernel what /proc/self/cwd will be after a + // pivot_root(2). + + if err := unix.Fchdir(oldroot); err != nil { + return err + } + + // Make oldroot rslave to make sure our unmounts don't propagate to the + // host (and thus bork the machine). We don't use rprivate because this is + // known to cause issues due to races where we still have a reference to a + // mount while a process in the host namespace are trying to operate on + // something they think has no mounts (devicemapper in particular). + if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + return err + } + // Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { + return err + } + + // Switch back to our shiny new root. + if err := unix.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + return nil +} + +func msMoveRoot(rootfs string) error { + mountinfos, err := mount.GetMounts() + if err != nil { + return err + } + + absRootfs, err := filepath.Abs(rootfs) + if err != nil { + return err + } + + for _, info := range mountinfos { + p, err := filepath.Abs(info.Mountpoint) + if err != nil { + return err + } + // Umount every syfs and proc file systems, except those under the container rootfs + if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) { + continue + } + // Be sure umount events are not propagated to the host. + if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + return err + } + if err := unix.Unmount(p, unix.MNT_DETACH); err != nil { + if err != unix.EINVAL && err != unix.EPERM { + return err + } else { + // If we have not privileges for umounting (e.g. rootless), then + // cover the path. + if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil { + return err + } + } + } + } + if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { + return err + } + return chroot(rootfs) +} + +func chroot(rootfs string) error { + if err := unix.Chroot("."); err != nil { + return err + } + return unix.Chdir("/") +} + +// createIfNotExists creates a file or a directory only if it does not already exist. +func createIfNotExists(path string, isDir bool) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + return os.MkdirAll(path, 0755) + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + return nil +} + +// readonlyPath will make a path read only. +func readonlyPath(path string) error { + if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") +} + +// remountReadonly will remount an existing mount point and ensure that it is read-only. +func remountReadonly(m *configs.Mount) error { + var ( + dest = m.Destination + flags = m.Flags + ) + for i := 0; i < 5; i++ { + // There is a special case in the kernel for + // MS_REMOUNT | MS_BIND, which allows us to change only the + // flags even as an unprivileged user (i.e. user namespace) + // assuming we don't drop any security related flags (nodev, + // nosuid, etc.). So, let's use that case so that we can do + // this re-mount without failing in a userns. + flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY + if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + return nil + } + return fmt.Errorf("unable to mount %s as readonly max retries reached", dest) +} + +// maskPath masks the top of the specified path inside a container to avoid +// security issues from processes reading information from non-namespace aware +// mounts ( proc/kcore ). +// For files, maskPath bind mounts /dev/null over the top of the specified path. +// For directories, maskPath mounts read-only tmpfs over the top of the specified path. +func maskPath(path string, mountLabel string) error { + if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + if err == unix.ENOTDIR { + return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) + } + return err + } + return nil +} + +// writeSystemProperty writes the value to a path under /proc/sys as determined from the key. +// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. +func writeSystemProperty(key, value string) error { + keyPath := strings.Replace(key, ".", "/", -1) + return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) +} + +func remount(m *configs.Mount, rootfs string) error { + var ( + dest = m.Destination + ) + if !strings.HasPrefix(dest, rootfs) { + dest = filepath.Join(rootfs, dest) + } + return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "") +} + +// Do the mount operation followed by additional mounts required to take care +// of propagation flags. +func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { + var ( + dest = m.Destination + data = label.FormatMountLabel(m.Data, mountLabel) + flags = m.Flags + ) + if libcontainerUtils.CleanPath(dest) == "/dev" { + flags &= ^unix.MS_RDONLY + } + + copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP + if !(copyUp || strings.HasPrefix(dest, rootfs)) { + dest = filepath.Join(rootfs, dest) + } + + if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { + return err + } + + for _, pflag := range m.PropagationFlags { + if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil +} + +func mountNewCgroup(m *configs.Mount) error { + var ( + data = m.Data + source = m.Source + ) + if data == "systemd" { + data = cgroups.CgroupNamePrefix + data + source = "systemd" + } + if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil { + return err + } + return nil +} diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go new file mode 100644 index 0000000..1bfe7c6 --- /dev/null +++ b/libcontainer/rootfs_linux_test.go @@ -0,0 +1,101 @@ +// +build linux + +package libcontainer + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestCheckMountDestOnProc(t *testing.T) { + dest := "/rootfs/proc/sys" + err := checkProcMount("/rootfs", dest, "") + if err == nil { + t.Fatal("destination inside proc should return an error") + } +} + +func TestCheckMountDestOnProcChroot(t *testing.T) { + dest := "/rootfs/proc/" + err := checkProcMount("/rootfs", dest, "/proc") + if err != nil { + t.Fatal("destination inside proc when using chroot should not return an error") + } +} + +func TestCheckMountDestInSys(t *testing.T) { + dest := "/rootfs//sys/fs/cgroup" + err := checkProcMount("/rootfs", dest, "") + if err != nil { + t.Fatal("destination inside /sys should not return an error") + } +} + +func TestCheckMountDestFalsePositive(t *testing.T) { + dest := "/rootfs/sysfiles/fs/cgroup" + err := checkProcMount("/rootfs", dest, "") + if err != nil { + t.Fatal(err) + } +} + +func TestNeedsSetupDev(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/dev", + Destination: "/dev", + }, + }, + } + if needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be false, got true") + } +} + +func TestNeedsSetupDevStrangeSource(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/devx", + Destination: "/dev", + }, + }, + } + if needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be false, got true") + } +} + +func TestNeedsSetupDevStrangeDest(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/dev", + Destination: "/devx", + }, + }, + } + if !needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be true, got false") + } +} + +func TestNeedsSetupDevStrangeSourceDest(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/devx", + Destination: "/devx", + }, + }, + } + if !needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be true, got false") + } +} diff --git a/libcontainer/seccomp/config.go b/libcontainer/seccomp/config.go new file mode 100644 index 0000000..c321227 --- /dev/null +++ b/libcontainer/seccomp/config.go @@ -0,0 +1,77 @@ +package seccomp + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok == true { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header, though some +// (notable, SCMP_ACT_TRACE) are not available in this implementation and will +// return errors. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok == true { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. +func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok == true { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} diff --git a/libcontainer/seccomp/fixtures/proc_self_status b/libcontainer/seccomp/fixtures/proc_self_status new file mode 100644 index 0000000..0e0084f --- /dev/null +++ b/libcontainer/seccomp/fixtures/proc_self_status @@ -0,0 +1,47 @@ +Name: cat +State: R (running) +Tgid: 19383 +Ngid: 0 +Pid: 19383 +PPid: 19275 +TracerPid: 0 +Uid: 1000 1000 1000 1000 +Gid: 1000 1000 1000 1000 +FDSize: 256 +Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001 +NStgid: 19383 +NSpid: 19383 +NSpgid: 19383 +NSsid: 19275 +VmPeak: 5944 kB +VmSize: 5944 kB +VmLck: 0 kB +VmPin: 0 kB +VmHWM: 744 kB +VmRSS: 744 kB +VmData: 324 kB +VmStk: 136 kB +VmExe: 48 kB +VmLib: 1776 kB +VmPTE: 32 kB +VmPMD: 12 kB +VmSwap: 0 kB +Threads: 1 +SigQ: 0/30067 +SigPnd: 0000000000000000 +ShdPnd: 0000000000000000 +SigBlk: 0000000000000000 +SigIgn: 0000000000000080 +SigCgt: 0000000000000000 +CapInh: 0000000000000000 +CapPrm: 0000000000000000 +CapEff: 0000000000000000 +CapBnd: 0000003fffffffff +CapAmb: 0000000000000000 +Seccomp: 0 +Cpus_allowed: f +Cpus_allowed_list: 0-3 +Mems_allowed: 00000000,00000001 +Mems_allowed_list: 0 +voluntary_ctxt_switches: 0 +nonvoluntary_ctxt_switches: 1 diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 0000000..1b7a071 --- /dev/null +++ b/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,261 @@ +// +build linux,cgo,seccomp + +package seccomp + +import ( + "bufio" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" + + "golang.org/x/sys/unix" +) + +var ( + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actLog = libseccomp.ActLog + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) +) + +const ( + // Linux system calls can have at most 6 arguments + syscallMaxArguments int = 6 +) + +// Filters given syscalls in a container, preventing them from being used +// Started in the container init process, and carried over to all child processes +// Setns calls, however, require a separate invocation, as they are not children +// of the init until they join the namespace +func InitSeccomp(config *configs.Seccomp) error { + if config == nil { + return fmt.Errorf("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction) + if err != nil { + return fmt.Errorf("error initializing seccomp - invalid default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return fmt.Errorf("error creating filter: %s", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return fmt.Errorf("error validating Seccomp architecture: %s", err) + } + + if err := filter.AddArch(scmpArch); err != nil { + return fmt.Errorf("error adding architecture to seccomp filter: %s", err) + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return fmt.Errorf("error setting no new privileges: %s", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return fmt.Errorf("encountered nil syscall while initializing Seccomp") + } + + if err = matchCall(filter, call); err != nil { + return err + } + } + + if err = filter.Load(); err != nil { + return fmt.Errorf("error loading seccomp filter into kernel: %s", err) + } + + return nil +} + +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. + if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill: + return actKill, nil + case configs.Errno: + return actErrno, nil + case configs.Trap: + return actTrap, nil + case configs.Allow: + return actAllow, nil + case configs.Trace: + return actTrace, nil + case configs.Log: + return actLog, nil + default: + return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, fmt.Errorf("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { + if call == nil || filter == nil { + return fmt.Errorf("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return fmt.Errorf("empty string is not a valid syscall") + } + + // If we can't resolve the syscall, assume it's not supported on this kernel + // Ignore it, don't error out + callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + return nil + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action) + if err != nil { + return fmt.Errorf("action in seccomp profile is invalid: %s", err) + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err = filter.AddRule(callNum, callAct); err != nil { + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err) + } + } else { + // If two or more arguments have the same condition, + // Revert to old behavior, adding each condition as a separate rule + argCounts := make([]uint, syscallMaxArguments) + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err) + } + + argCounts[cond.Index] += 1 + + conditions = append(conditions, newCond) + } + + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + // Revert to old behavior + // Add each condition attached to a separate rule + for _, cond := range conditions { + condArr := []libseccomp.ScmpCondition{cond} + + if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } else { + // No conditions share same argument + // Use new, proper behavior + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } + + return nil +} + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + if err := s.Err(); err != nil { + return nil, err + } + + return status, nil +} diff --git a/libcontainer/seccomp/seccomp_linux_test.go b/libcontainer/seccomp/seccomp_linux_test.go new file mode 100644 index 0000000..67a2ef6 --- /dev/null +++ b/libcontainer/seccomp/seccomp_linux_test.go @@ -0,0 +1,17 @@ +// +build linux,cgo,seccomp + +package seccomp + +import "testing" + +func TestParseStatusFile(t *testing.T) { + s, err := parseStatusFile("fixtures/proc_self_status") + if err != nil { + t.Fatal(err) + } + + if _, ok := s["Seccomp"]; !ok { + + t.Fatal("expected to find 'Seccomp' in the map but did not.") + } +} diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 0000000..44df1ad --- /dev/null +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) error { + if config != nil { + return ErrSeccompNotEnabled + } + return nil +} + +// IsEnabled returns false, because it is not supported. +func IsEnabled() bool { + return false +} diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go new file mode 100644 index 0000000..888981f --- /dev/null +++ b/libcontainer/setns_init_linux.go @@ -0,0 +1,92 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "runtime" + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +// linuxSetnsInit performs the container's initialization for running a new process +// inside an existing container. +type linuxSetnsInit struct { + pipe *os.File + consoleSocket *os.File + config *initConfig +} + +func (l *linuxSetnsInit) getSessionRingName() string { + return fmt.Sprintf("_ses.%s", l.config.ContainerId) +} + +func (l *linuxSetnsInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if !l.config.Config.NoNewKeyring { + if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil { + return err + } + defer label.SetKeyLabel("") + // Do not inherit the parent's session keyring. + if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { + // Same justification as in standart_init_linux.go as to why we + // don't bail on ENOSYS. + // + // TODO(cyphar): And we should have logging here too. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } + } + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, false); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + defer label.SetProcessLabel("") + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) +} diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go new file mode 100644 index 0000000..23e225c --- /dev/null +++ b/libcontainer/specconv/example.go @@ -0,0 +1,223 @@ +package specconv + +import ( + "os" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Example returns an example spec file, with many options set so a user can +// see what a standard spec file looks like. +func Example() *specs.Spec { + return &specs.Spec{ + Version: specs.Version, + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: &specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: &specs.Linux{ + MaskedPaths: []string{ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi", + }, + ReadonlyPaths: []string{ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } +} + +// ToRootless converts the given spec file into one that should work with +// rootless containers (euid != 0), by removing incompatible options and adding others that +// are needed. +func ToRootless(spec *specs.Spec) { + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }} + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. + if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings. + var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil +} diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go new file mode 100644 index 0000000..d9e73c4 --- /dev/null +++ b/libcontainer/specconv/spec_linux.go @@ -0,0 +1,839 @@ +// +build linux + +// Package specconv implements conversion of specifications to libcontainer +// configurations +package specconv + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +const wildcard = -1 + +var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: configs.NEWUTS, + specs.CgroupNamespace: configs.NEWCGROUP, +} + +var mountPropagationMapping = map[string]int{ + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + "unbindable": unix.MS_UNBINDABLE, + "": 0, +} + +// AllowedDevices is exposed for devicefilter_test.go +var AllowedDevices = []*configs.Device{ + // allow mknod for any device + { + Type: 'c', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'b', + Major: wildcard, + Minor: wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + Allow: true, + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, +} + +type CreateOpts struct { + CgroupName string + UseSystemdCgroup bool + NoPivotRoot bool + NoNewKeyring bool + Spec *specs.Spec + RootlessEUID bool + RootlessCgroups bool +} + +// CreateLibcontainerConfig creates a new libcontainer configuration from a +// given specification and a cgroup name +func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + // runc's cwd will always be the bundle path + rcwd, err := os.Getwd() + if err != nil { + return nil, err + } + cwd, err := filepath.Abs(rcwd) + if err != nil { + return nil, err + } + spec := opts.Spec + if spec.Root == nil { + return nil, fmt.Errorf("Root must be specified") + } + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + labels := []string{} + for k, v := range spec.Annotations { + labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + } + config := &configs.Config{ + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + RootlessEUID: opts.RootlessEUID, + RootlessCgroups: opts.RootlessCgroups, + } + + exists := false + for _, m := range spec.Mounts { + config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) + } + if err := createDevices(spec, config); err != nil { + return nil, err + } + c, err := CreateCgroupConfig(opts) + if err != nil { + return nil, err + } + config.Cgroups = c + // set linux-specific config + if spec.Linux != nil { + if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) + } + if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { + return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root") + } + + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + if config.Namespaces.Contains(t) { + return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) + } + config.Namespaces.Add(t, ns.Path) + } + if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { + config.Networks = []*configs.Network{ + { + Type: "loopback", + }, + } + } + if config.Namespaces.Contains(configs.NEWUSER) { + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + } + config.MaskPaths = spec.Linux.MaskedPaths + config.ReadonlyPaths = spec.Linux.ReadonlyPaths + config.MountLabel = spec.Linux.MountLabel + config.Sysctl = spec.Linux.Sysctl + if spec.Linux.Seccomp != nil { + seccomp, err := SetupSeccomp(spec.Linux.Seccomp) + if err != nil { + return nil, err + } + config.Seccomp = seccomp + } + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + if spec.Linux.IntelRdt.MemBwSchema != "" { + config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema + } + } + } + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } + } + } + createHooks(spec, config) + config.Version = specs.Version + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { + flags, pgflags, data, ext := parseMountOptions(m.Options) + source := m.Source + device := m.Type + if flags&unix.MS_BIND != 0 { + // Any "type" the user specified is meaningless (and ignored) for + // bind-mounts -- so we set it to "bind" because rootfs_linux.go + // (incorrectly) relies on this for some checks. + device = "bind" + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + return &configs.Mount{ + Device: device, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + PropagationFlags: pgflags, + Extensions: ext, + } +} + +func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) + + c := &configs.Cgroup{ + Resources: &configs.Resources{}, + } + + if spec.Linux != nil && spec.Linux.CgroupsPath != "" { + myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) + if useSystemdCgroup { + myCgroupPath = spec.Linux.CgroupsPath + } + } + + if useSystemdCgroup { + if myCgroupPath == "" { + c.Parent = "system.slice" + c.ScopePrefix = "runc" + c.Name = name + } else { + // Parse the path from expected "slice:prefix:name" + // for e.g. "system.slice:docker:1234" + parts := strings.Split(myCgroupPath, ":") + if len(parts) != 3 { + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) + } + c.Parent = parts[0] + c.ScopePrefix = parts[1] + c.Name = parts[2] + } + } else { + if myCgroupPath == "" { + c.Name = name + } + c.Path = myCgroupPath + } + + // In rootless containers, any attempt to make cgroup changes is likely to fail. + // libcontainer will validate this but ignores the error. + c.Resources.AllowedDevices = AllowedDevices + if spec.Linux != nil { + r := spec.Linux.Resources + if r == nil { + return c, nil + } + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + dd := &configs.Device{ + Type: dt, + Major: major, + Minor: minor, + Permissions: d.Access, + Allow: d.Allow, + } + c.Resources.Devices = append(c.Resources.Devices, dd) + } + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil { + c.Resources.KernelMemory = *r.Memory.Kernel + } + if r.Memory.KernelTCP != nil { + c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } + } + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } + if r.CPU.Cpus != "" { + c.Resources.CpusetCpus = r.CPU.Cpus + } + if r.CPU.Mems != "" { + c.Resources.CpusetMems = r.CPU.Mems + } + } + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit + } + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) + } + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } + } + } + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, + }) + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + } + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...) + return c, nil +} + +func stringToCgroupDeviceRune(s string) (rune, error) { + switch s { + case "a": + return 'a', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid cgroup device type %q", s) + } +} + +func stringToDeviceRune(s string) (rune, error) { + switch s { + case "p": + return 'p', nil + case "u": + return 'u', nil + case "b": + return 'b', nil + case "c": + return 'c', nil + default: + return 0, fmt.Errorf("invalid device type %q", s) + } +} + +func createDevices(spec *specs.Spec, config *configs.Config) error { + // add whitelisted devices + config.Devices = []*configs.Device{ + { + Type: 'c', + Path: "/dev/null", + Major: 1, + Minor: 3, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/random", + Major: 1, + Minor: 8, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/full", + Major: 1, + Minor: 7, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/tty", + Major: 5, + Minor: 0, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/zero", + Major: 1, + Minor: 5, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + { + Type: 'c', + Path: "/dev/urandom", + Major: 1, + Minor: 9, + FileMode: 0666, + Uid: 0, + Gid: 0, + }, + } + // merge in additional devices from the spec + if spec.Linux != nil { + for _, d := range spec.Linux.Devices { + var uid, gid uint32 + var filemode os.FileMode = 0666 + + if d.UID != nil { + uid = *d.UID + } + if d.GID != nil { + gid = *d.GID + } + dt, err := stringToDeviceRune(d.Type) + if err != nil { + return err + } + if d.FileMode != nil { + filemode = *d.FileMode + } + device := &configs.Device{ + Type: dt, + Path: d.Path, + Major: d.Major, + Minor: d.Minor, + FileMode: filemode, + Uid: uid, + Gid: gid, + } + config.Devices = append(config.Devices, device) + } + } + return nil +} + +func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { + create := func(m specs.LinuxIDMapping) configs.IDMap { + return configs.IDMap{ + HostID: int(m.HostID), + ContainerID: int(m.ContainerID), + Size: int(m.Size), + } + } + if spec.Linux != nil { + for _, m := range spec.Linux.UIDMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GIDMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + } + rootUID, err := config.HostRootUID() + if err != nil { + return err + } + rootGID, err := config.HostRootGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUID) + node.Gid = uint32(rootGID) + } + return nil +} + +// parseMountOptions parses the string and returns the flags, propagation +// flags and any mount data that it contains. +func parseMountOptions(options []string) (int, []int, string, int) { + var ( + flag int + pgflag []int + data []string + extFlags int + ) + flags := map[string]struct { + clear bool + flag int + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + "norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + } + propagationFlags := map[string]int{ + "private": unix.MS_PRIVATE, + "shared": unix.MS_SHARED, + "slave": unix.MS_SLAVE, + "unbindable": unix.MS_UNBINDABLE, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "rshared": unix.MS_SHARED | unix.MS_REC, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + } + extensionFlags := map[string]struct { + clear bool + flag int + }{ + "tmpcopyup": {false, configs.EXT_COPYUP}, + } + for _, o := range options { + // If the option does not exist in the flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f != 0 { + pgflag = append(pgflag, f) + } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { + if f.clear { + extFlags &= ^f.flag + } else { + extFlags |= f.flag + } + } else { + data = append(data, o) + } + } + return flag, pgflag, strings.Join(data, ","), extFlags +} + +func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { + if config == nil { + return nil, nil + } + + // No default action specified, no syscalls listed, assume seccomp disabled + if config.DefaultAction == "" && len(config.Syscalls) == 0 { + return nil, nil + } + + newConfig := new(configs.Seccomp) + newConfig.Syscalls = []*configs.Syscall{} + + if len(config.Architectures) > 0 { + newConfig.Architectures = []string{} + for _, arch := range config.Architectures { + newArch, err := seccomp.ConvertStringToArch(string(arch)) + if err != nil { + return nil, err + } + newConfig.Architectures = append(newConfig.Architectures, newArch) + } + } + + // Convert default action from string representation + newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) + if err != nil { + return nil, err + } + newConfig.DefaultAction = newDefaultAction + + // Loop through all syscall blocks and convert them to libcontainer format + for _, call := range config.Syscalls { + newAction, err := seccomp.ConvertStringToAction(string(call.Action)) + if err != nil { + return nil, err + } + + for _, name := range call.Names { + newCall := configs.Syscall{ + Name: name, + Action: newAction, + Args: []*configs.Arg{}, + } + // Loop through all the arguments of the syscall and convert them + for _, arg := range call.Args { + newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) + if err != nil { + return nil, err + } + + newArg := configs.Arg{ + Index: arg.Index, + Value: arg.Value, + ValueTwo: arg.ValueTwo, + Op: newOp, + } + + newCall.Args = append(newCall.Args, &newArg) + } + newConfig.Syscalls = append(newConfig.Syscalls, &newCall) + } + } + + return newConfig, nil +} + +func createHooks(rspec *specs.Spec, config *configs.Config) { + config.Hooks = &configs.Hooks{} + if rspec.Hooks != nil { + + for _, h := range rspec.Hooks.Prestart { + cmd := createCommandHook(h) + config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststart { + cmd := createCommandHook(h) + config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststop { + cmd := createCommandHook(h) + config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd)) + } + } +} + +func createCommandHook(h specs.Hook) configs.Command { + cmd := configs.Command{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + } + if h.Timeout != nil { + d := time.Duration(*h.Timeout) * time.Second + cmd.Timeout = &d + } + return cmd +} diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go new file mode 100644 index 0000000..da6a43a --- /dev/null +++ b/libcontainer/specconv/spec_linux_test.go @@ -0,0 +1,452 @@ +// +build linux + +package specconv + +import ( + "os" + "strings" + "testing" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestCreateCommandHookTimeout(t *testing.T) { + timeout := 3600 + hook := specs.Hook{ + Path: "/some/hook/path", + Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + Timeout: &timeout, + } + command := createCommandHook(hook) + timeoutStr := command.Timeout.String() + if timeoutStr != "1h0m0s" { + t.Errorf("Expected the Timeout to be 1h0m0s, got: %s", timeoutStr) + } +} + +func TestCreateHooks(t *testing.T) { + rspec := &specs.Spec{ + Hooks: &specs.Hooks{ + Prestart: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + Poststart: []specs.Hook{ + { + Path: "/some/hook/path", + Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + }, + { + Path: "/some/hook2/path", + }, + { + Path: "/some/hook3/path", + }, + }, + Poststop: []specs.Hook{ + { + Path: "/some/hook/path", + Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + }, + { + Path: "/some/hook2/path", + }, + { + Path: "/some/hook3/path", + }, + { + Path: "/some/hook4/path", + Args: []string{"--some", "thing"}, + }, + }, + }, + } + conf := &configs.Config{} + createHooks(rspec, conf) + + prestart := conf.Hooks.Prestart + + if len(prestart) != 2 { + t.Error("Expected 2 Prestart hooks") + } + + poststart := conf.Hooks.Poststart + + if len(poststart) != 3 { + t.Error("Expected 3 Poststart hooks") + } + + poststop := conf.Hooks.Poststop + + if len(poststop) != 4 { + t.Error("Expected 4 Poststop hooks") + } + +} +func TestSetupSeccomp(t *testing.T) { + conf := &specs.LinuxSeccomp{ + DefaultAction: "SCMP_ACT_ERRNO", + Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM}, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{"clone"}, + Action: "SCMP_ACT_ALLOW", + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP, + ValueTwo: 0, + Op: "SCMP_CMP_MASKED_EQ", + }, + }, + }, + { + Names: []string{ + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "send", + "sendfile", + }, + Action: "SCMP_ACT_ALLOW", + }, + }, + } + seccomp, err := SetupSeccomp(conf) + + if err != nil { + t.Errorf("Couldn't create Seccomp config: %v", err) + } + + if seccomp.DefaultAction != 2 { // SCMP_ACT_ERRNO + t.Error("Wrong conversion for DefaultAction") + } + + if len(seccomp.Architectures) != 2 { + t.Error("Wrong number of architectures") + } + + if seccomp.Architectures[0] != "amd64" || seccomp.Architectures[1] != "arm" { + t.Error("Expected architectures are not found") + } + + calls := seccomp.Syscalls + + callsLength := len(calls) + if callsLength != 8 { + t.Errorf("Expected 8 syscalls, got :%d", callsLength) + } + + for i, call := range calls { + if i == 0 { + expectedCloneSyscallArgs := configs.Arg{ + Index: 0, + Op: 7, // SCMP_CMP_MASKED_EQ + Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP, + ValueTwo: 0, + } + if expectedCloneSyscallArgs != *call.Args[0] { + t.Errorf("Wrong arguments conversion for the clone syscall under test") + } + } + if call.Action != 4 { + t.Error("Wrong conversion for the clone syscall action") + } + + } + +} + +func TestLinuxCgroupWithMemoryResource(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + devices := []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + } + + limit := int64(100) + reservation := int64(50) + swap := int64(20) + kernel := int64(40) + kernelTCP := int64(45) + swappiness := uint64(1) + swappinessPtr := &swappiness + disableOOMKiller := true + resources := &specs.LinuxResources{ + Devices: devices, + Memory: &specs.LinuxMemory{ + Limit: &limit, + Reservation: &reservation, + Swap: &swap, + Kernel: &kernel, + KernelTCP: &kernelTCP, + Swappiness: swappinessPtr, + DisableOOMKiller: &disableOOMKiller, + }, + } + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + Resources: resources, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != cgroupsPath { + t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path) + } + if cgroup.Resources.Memory != limit { + t.Errorf("Expected to have %d as memory limit, got %d", limit, cgroup.Resources.Memory) + } + if cgroup.Resources.MemoryReservation != reservation { + t.Errorf("Expected to have %d as memory reservation, got %d", reservation, cgroup.Resources.MemoryReservation) + } + if cgroup.Resources.MemorySwap != swap { + t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap) + } + if cgroup.Resources.KernelMemory != kernel { + t.Errorf("Expected to have %d as Kernel Memory, got %d", kernel, cgroup.Resources.KernelMemory) + } + if cgroup.Resources.KernelMemoryTCP != kernelTCP { + t.Errorf("Expected to have %d as TCP Kernel Memory, got %d", kernelTCP, cgroup.Resources.KernelMemoryTCP) + } + if cgroup.Resources.MemorySwappiness != swappinessPtr { + t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness) + } + if cgroup.Resources.OomKillDisable != disableOOMKiller { + t.Errorf("The OOMKiller should be enabled") + } +} + +func TestLinuxCgroupSystemd(t *testing.T) { + cgroupsPath := "parent:scopeprefix:name" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + UseSystemdCgroup: true, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts) + + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + expectedParent := "parent" + if cgroup.Parent != expectedParent { + t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent) + } + + expectedScopePrefix := "scopeprefix" + if cgroup.ScopePrefix != expectedScopePrefix { + t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix) + } + + expectedName := "name" + if cgroup.Name != expectedName { + t.Errorf("Expected to have %s as Name instead of %s", expectedName, cgroup.Name) + } +} + +func TestLinuxCgroupSystemdWithEmptyPath(t *testing.T) { + cgroupsPath := "" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: true, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts) + + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + expectedParent := "system.slice" + if cgroup.Parent != expectedParent { + t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent) + } + + expectedScopePrefix := "runc" + if cgroup.ScopePrefix != expectedScopePrefix { + t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix) + } + + if cgroup.Name != opts.CgroupName { + t.Errorf("Expected to have %s as Name instead of %s", opts.CgroupName, cgroup.Name) + } +} + +func TestLinuxCgroupSystemdWithInvalidPath(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: true, + Spec: spec, + } + + _, err := CreateCgroupConfig(opts) + if err == nil { + t.Error("Expected to produce an error if not using the correct format for cgroup paths belonging to systemd") + } +} +func TestLinuxCgroupsPathSpecified(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != cgroupsPath { + t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path) + } +} + +func TestLinuxCgroupsPathNotSpecified(t *testing.T) { + spec := &specs.Spec{} + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != "" { + t.Errorf("Wrong cgroupsPath, expected it to be empty string, got '%s'", cgroup.Path) + } +} + +func TestSpecconvExampleValidate(t *testing.T) { + spec := Example() + spec.Root.Path = "/" + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid container config: %v", err) + } +} + +func TestDupNamespaces(t *testing.T) { + spec := &specs.Spec{ + Root: &specs.Root{ + Path: "rootfs", + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, + }, + }, + } + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + + if !strings.Contains(err.Error(), "malformed spec file: duplicated ns") { + t.Errorf("Duplicated namespaces should be forbidden") + } +} + +func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + spec := Example() + spec.Root.Path = "/" + ToRootless(spec) + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + RootlessEUID: true, + RootlessCgroups: true, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid rootless container config: %v", err) + } +} diff --git a/libcontainer/stacktrace/capture.go b/libcontainer/stacktrace/capture.go new file mode 100644 index 0000000..0bbe149 --- /dev/null +++ b/libcontainer/stacktrace/capture.go @@ -0,0 +1,27 @@ +package stacktrace + +import "runtime" + +// Capture captures a stacktrace for the current calling go program +// +// skip is the number of frames to skip +func Capture(userSkip int) Stacktrace { + var ( + skip = userSkip + 1 // add one for our own function + frames []Frame + prevPc uintptr + ) + for i := skip; ; i++ { + pc, file, line, ok := runtime.Caller(i) + //detect if caller is repeated to avoid loop, gccgo + //currently runs into a loop without this check + if !ok || pc == prevPc { + break + } + frames = append(frames, NewFrame(pc, file, line)) + prevPc = pc + } + return Stacktrace{ + Frames: frames, + } +} diff --git a/libcontainer/stacktrace/capture_test.go b/libcontainer/stacktrace/capture_test.go new file mode 100644 index 0000000..978f6c4 --- /dev/null +++ b/libcontainer/stacktrace/capture_test.go @@ -0,0 +1,31 @@ +package stacktrace + +import ( + "strings" + "testing" +) + +func captureFunc() Stacktrace { + return Capture(0) +} + +func TestCaptureTestFunc(t *testing.T) { + stack := captureFunc() + + if len(stack.Frames) == 0 { + t.Fatal("expected stack frames to be returned") + } + + // the first frame is the caller + frame := stack.Frames[0] + if expected := "captureFunc"; frame.Function != expected { + t.Fatalf("expected function %q but received %q", expected, frame.Function) + } + expected := "/runc/libcontainer/stacktrace" + if !strings.HasSuffix(frame.Package, expected) { + t.Fatalf("expected package %q but received %q", expected, frame.Package) + } + if expected := "capture_test.go"; frame.File != expected { + t.Fatalf("expected file %q but received %q", expected, frame.File) + } +} diff --git a/libcontainer/stacktrace/frame.go b/libcontainer/stacktrace/frame.go new file mode 100644 index 0000000..0d590d9 --- /dev/null +++ b/libcontainer/stacktrace/frame.go @@ -0,0 +1,38 @@ +package stacktrace + +import ( + "path/filepath" + "runtime" + "strings" +) + +// NewFrame returns a new stack frame for the provided information +func NewFrame(pc uintptr, file string, line int) Frame { + fn := runtime.FuncForPC(pc) + if fn == nil { + return Frame{} + } + pack, name := parseFunctionName(fn.Name()) + return Frame{ + Line: line, + File: filepath.Base(file), + Package: pack, + Function: name, + } +} + +func parseFunctionName(name string) (string, string) { + i := strings.LastIndex(name, ".") + if i == -1 { + return "", name + } + return name[:i], name[i+1:] +} + +// Frame contains all the information for a stack frame within a go program +type Frame struct { + File string + Function string + Package string + Line int +} diff --git a/libcontainer/stacktrace/frame_test.go b/libcontainer/stacktrace/frame_test.go new file mode 100644 index 0000000..c6fc78e --- /dev/null +++ b/libcontainer/stacktrace/frame_test.go @@ -0,0 +1,20 @@ +package stacktrace + +import "testing" + +func TestParsePackageName(t *testing.T) { + var ( + name = "github.com/opencontainers/runc/libcontainer/stacktrace.captureFunc" + expectedPackage = "github.com/opencontainers/runc/libcontainer/stacktrace" + expectedFunction = "captureFunc" + ) + + pack, funcName := parseFunctionName(name) + if pack != expectedPackage { + t.Fatalf("expected package %q but received %q", expectedPackage, pack) + } + + if funcName != expectedFunction { + t.Fatalf("expected function %q but received %q", expectedFunction, funcName) + } +} diff --git a/libcontainer/stacktrace/stacktrace.go b/libcontainer/stacktrace/stacktrace.go new file mode 100644 index 0000000..5e8b58d --- /dev/null +++ b/libcontainer/stacktrace/stacktrace.go @@ -0,0 +1,5 @@ +package stacktrace + +type Stacktrace struct { + Frames []Frame +} diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go new file mode 100644 index 0000000..4e03b8b --- /dev/null +++ b/libcontainer/standard_init_linux.go @@ -0,0 +1,214 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "os/exec" + "runtime" + "syscall" //only for Exec + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +type linuxStandardInit struct { + pipe *os.File + consoleSocket *os.File + parentPid int + fifoFd int + config *initConfig +} + +func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { + var newperms uint32 + + if l.config.Config.Namespaces.Contains(configs.NEWUSER) { + // With user ns we need 'other' search permissions. + newperms = 0x8 + } else { + // Without user ns we need 'UID' search permissions. + newperms = 0x80000 + } + + // Create a unique per session container name that we can join in setns; + // However, other containers can also join it. + return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +} + +func (l *linuxStandardInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if !l.config.Config.NoNewKeyring { + if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil { + return err + } + defer label.SetKeyLabel("") + ringname, keepperms, newperms := l.getSessionRingParams() + + // Do not inherit the parent's session keyring. + if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { + // If keyrings aren't supported then it is likely we are on an + // older kernel (or inside an LXC container). While we could bail, + // the security feature we are using here is best-effort (it only + // really provides marginal protection since VFS credentials are + // the only significant protection of keyrings). + // + // TODO(cyphar): Log this so people know what's going on, once we + // have proper logging in 'runc init'. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } else { + // Make session keyring searcheable. If we've gotten this far we + // bail on any error -- we don't want to have a keyring with bad + // permissions. + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return errors.Wrap(err, "mod keyring permissions") + } + } + } + + if err := setupNetwork(l.config); err != nil { + return err + } + if err := setupRoute(l.config.Config); err != nil { + return err + } + + label.Init() + if err := prepareRootfs(l.pipe, l.config); err != nil { + return err + } + // Set up the console. This has to be done *before* we finalize the rootfs, + // but *after* we've given the user the chance to set up all of the mounts + // they wanted. + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, true); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return errors.Wrap(err, "setctty") + } + } + + // Finish the rootfs setup. + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := finalizeRootfs(l.config.Config); err != nil { + return err + } + } + + if hostname := l.config.Config.Hostname; hostname != "" { + if err := unix.Sethostname([]byte(hostname)); err != nil { + return errors.Wrap(err, "sethostname") + } + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return errors.Wrap(err, "apply apparmor profile") + } + + for key, value := range l.config.Config.Sysctl { + if err := writeSystemProperty(key, value); err != nil { + return errors.Wrapf(err, "write sysctl key %s", key) + } + } + for _, path := range l.config.Config.ReadonlyPaths { + if err := readonlyPath(path); err != nil { + return errors.Wrapf(err, "readonly path %s", path) + } + } + for _, path := range l.config.Config.MaskPaths { + if err := maskPath(path, l.config.Config.MountLabel); err != nil { + return errors.Wrapf(err, "mask path %s", path) + } + } + pdeath, err := system.GetParentDeathSignal() + if err != nil { + return errors.Wrap(err, "get pdeath signal") + } + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return errors.Wrap(err, "set nonewprivileges") + } + } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return errors.Wrap(err, "sync ready") + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return errors.Wrap(err, "set process label") + } + defer label.SetProcessLabel("") + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return err + } + } + if err := finalizeNamespace(l.config); err != nil { + return err + } + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := pdeath.Restore(); err != nil { + return errors.Wrap(err, "restore pdeath signal") + } + // Compare the parent from the initial start of the init process and make + // sure that it did not change. if the parent changes that means it died + // and we were reparented to something else so we should just kill ourself + // and not cause problems for someone else. + if unix.Getppid() != l.parentPid { + return unix.Kill(unix.Getpid(), unix.SIGKILL) + } + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // Close the pipe to signal that we have completed our init. + l.pipe.Close() + // Wait for the FIFO to be opened on the other side before exec-ing the + // user process. We open it through /proc/self/fd/$fd, because the fd that + // was given to us was an O_PATH fd to the fifo itself. Linux allows us to + // re-open an O_PATH fd through /proc. + fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo") + } + if _, err := unix.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + // Close the O_PATH fifofd fd before exec because the kernel resets + // dumpable in the wrong order. This has been fixed in newer kernels, but + // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. + // N.B. the core issue itself (passing dirfds to the host filesystem) has + // since been resolved. + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 + unix.Close(l.fifoFd) + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } + return nil +} diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go new file mode 100644 index 0000000..5c16a42 --- /dev/null +++ b/libcontainer/state_linux.go @@ -0,0 +1,251 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/configs" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func newStateTransitionError(from, to containerState) error { + return &stateTransitionError{ + From: from.status().String(), + To: to.status().String(), + } +} + +// stateTransitionError is returned when an invalid state transition happens from one +// state to another. +type stateTransitionError struct { + From string + To string +} + +func (s *stateTransitionError) Error() string { + return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To) +} + +type containerState interface { + transition(containerState) error + destroy() error + status() Status +} + +func destroy(c *linuxContainer) error { + if !c.config.Namespaces.Contains(configs.NEWPID) { + if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { + logrus.Warn(err) + } + } + err := c.cgroupManager.Destroy() + if c.intelRdtManager != nil { + if ierr := c.intelRdtManager.Destroy(); err == nil { + err = ierr + } + } + if rerr := os.RemoveAll(c.root); err == nil { + err = rerr + } + c.initProcess = nil + if herr := runPoststopHooks(c); err == nil { + err = herr + } + c.state = &stoppedState{c: c} + return err +} + +func runPoststopHooks(c *linuxContainer) error { + if c.config.Hooks != nil { + s, err := c.currentOCIState() + if err != nil { + return err + } + for _, hook := range c.config.Hooks.Poststop { + if err := hook.Run(s); err != nil { + return err + } + } + } + return nil +} + +// stoppedState represents a container is a stopped/destroyed state. +type stoppedState struct { + c *linuxContainer +} + +func (b *stoppedState) status() Status { + return Stopped +} + +func (b *stoppedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *restoredState: + b.c.state = s + return nil + case *stoppedState: + return nil + } + return newStateTransitionError(b, s) +} + +func (b *stoppedState) destroy() error { + return destroy(b.c) +} + +// runningState represents a container that is currently running. +type runningState struct { + c *linuxContainer +} + +func (r *runningState) status() Status { + return Running +} + +func (r *runningState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) + } + r.c.state = s + return nil + case *pausedState: + r.c.state = s + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *runningState) destroy() error { + t, err := r.c.runType() + if err != nil { + return err + } + if t == Running { + return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) + } + return destroy(r.c) +} + +type createdState struct { + c *linuxContainer +} + +func (i *createdState) status() Status { + return Created +} + +func (i *createdState) transition(s containerState) error { + switch s.(type) { + case *runningState, *pausedState, *stoppedState: + i.c.state = s + return nil + case *createdState: + return nil + } + return newStateTransitionError(i, s) +} + +func (i *createdState) destroy() error { + i.c.initProcess.signal(unix.SIGKILL) + return destroy(i.c) +} + +// pausedState represents a container that is currently pause. It cannot be destroyed in a +// paused state and must transition back to running first. +type pausedState struct { + c *linuxContainer +} + +func (p *pausedState) status() Status { + return Paused +} + +func (p *pausedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *stoppedState: + p.c.state = s + return nil + case *pausedState: + return nil + } + return newStateTransitionError(p, s) +} + +func (p *pausedState) destroy() error { + t, err := p.c.runType() + if err != nil { + return err + } + if t != Running && t != Created { + if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return destroy(p.c) + } + return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) +} + +// restoredState is the same as the running state but also has associated checkpoint +// information that maybe need destroyed when the container is stopped and destroy is called. +type restoredState struct { + imageDir string + c *linuxContainer +} + +func (r *restoredState) status() Status { + return Running +} + +func (r *restoredState) transition(s containerState) error { + switch s.(type) { + case *stoppedState, *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *restoredState) destroy() error { + if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + return err + } + } + return destroy(r.c) +} + +// loadedState is used whenever a container is restored, loaded, or setting additional +// processes inside and it should not be destroyed when it is exiting. +type loadedState struct { + c *linuxContainer + s Status +} + +func (n *loadedState) status() Status { + return n.s +} + +func (n *loadedState) transition(s containerState) error { + n.c.state = s + return nil +} + +func (n *loadedState) destroy() error { + if err := n.c.refreshState(); err != nil { + return err + } + return n.c.state.destroy() +} diff --git a/libcontainer/state_linux_test.go b/libcontainer/state_linux_test.go new file mode 100644 index 0000000..6ef516b --- /dev/null +++ b/libcontainer/state_linux_test.go @@ -0,0 +1,116 @@ +// +build linux + +package libcontainer + +import ( + "reflect" + "testing" +) + +var states = map[containerState]Status{ + &createdState{}: Created, + &runningState{}: Running, + &restoredState{}: Running, + &pausedState{}: Paused, + &stoppedState{}: Stopped, + &loadedState{s: Running}: Running, +} + +func TestStateStatus(t *testing.T) { + for s, status := range states { + if s.status() != status { + t.Fatalf("state returned %s but expected %s", s.status(), status) + } + } +} + +func isStateTransitionError(err error) bool { + _, ok := err.(*stateTransitionError) + return ok +} + +func testTransitions(t *testing.T, initialState containerState, valid []containerState) { + validMap := map[reflect.Type]interface{}{} + for _, validState := range valid { + validMap[reflect.TypeOf(validState)] = nil + t.Run(validState.status().String(), func(t *testing.T) { + if err := initialState.transition(validState); err != nil { + t.Fatal(err) + } + }) + } + for state := range states { + if _, ok := validMap[reflect.TypeOf(state)]; ok { + continue + } + t.Run(state.status().String(), func(t *testing.T) { + err := initialState.transition(state) + if err == nil { + t.Fatal("transition should fail") + } + if !isStateTransitionError(err) { + t.Fatal("expected stateTransitionError") + } + }) + } +} + +func TestStoppedStateTransition(t *testing.T) { + testTransitions( + t, + &stoppedState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &runningState{}, + &restoredState{}, + }, + ) +} + +func TestPausedStateTransition(t *testing.T) { + testTransitions( + t, + &pausedState{c: &linuxContainer{}}, + []containerState{ + &pausedState{}, + &runningState{}, + &stoppedState{}, + }, + ) +} + +func TestRestoredStateTransition(t *testing.T) { + testTransitions( + t, + &restoredState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &runningState{}, + }, + ) +} + +func TestRunningStateTransition(t *testing.T) { + testTransitions( + t, + &runningState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &pausedState{}, + &runningState{}, + }, + ) +} + +func TestCreatedStateTransition(t *testing.T) { + testTransitions( + t, + &createdState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &pausedState{}, + &runningState{}, + &createdState{}, + }, + ) +} diff --git a/libcontainer/stats_linux.go b/libcontainer/stats_linux.go new file mode 100644 index 0000000..fff9dd3 --- /dev/null +++ b/libcontainer/stats_linux.go @@ -0,0 +1,13 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/types" +) + +type Stats struct { + Interfaces []*types.NetworkInterface + CgroupStats *cgroups.Stats + IntelRdtStats *intelrdt.Stats +} diff --git a/libcontainer/sync.go b/libcontainer/sync.go new file mode 100644 index 0000000..a8704a2 --- /dev/null +++ b/libcontainer/sync.go @@ -0,0 +1,104 @@ +package libcontainer + +import ( + "encoding/json" + "fmt" + "io" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +type syncType string + +// Constants that are used for synchronisation between the parent and child +// during container setup. They come in pairs (with procError being a generic +// response which is followed by a &genericError). +// +// [ child ] <-> [ parent ] +// +// procHooks --> [run hooks] +// <-- procResume +// +// procConsole --> +// <-- procConsoleReq +// [send(fd)] --> [recv(fd)] +// <-- procConsoleAck +// +// procReady --> [final setup] +// <-- procRun +const ( + procError syncType = "procError" + procReady syncType = "procReady" + procRun syncType = "procRun" + procHooks syncType = "procHooks" + procResume syncType = "procResume" +) + +type syncT struct { + Type syncType `json:"type"` +} + +// writeSync is used to write to a synchronisation pipe. An error is returned +// if there was a problem writing the payload. +func writeSync(pipe io.Writer, sync syncType) error { + return utils.WriteJSON(pipe, syncT{sync}) +} + +// readSync is used to read from a synchronisation pipe. An error is returned +// if we got a genericError, the pipe was closed, or we got an unexpected flag. +func readSync(pipe io.Reader, expected syncType) error { + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + + if procSync.Type == procError { + var ierr genericError + + if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { + return fmt.Errorf("failed reading error from parent: %v", err) + } + + return &ierr + } + + if procSync.Type != expected { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + +// parseSync runs the given callback function on each syncT received from the +// child. It will return once io.EOF is returned from the given pipe. +func parseSync(pipe io.Reader, fn func(*syncT) error) error { + dec := json.NewDecoder(pipe) + for { + var sync syncT + if err := dec.Decode(&sync); err != nil { + if err == io.EOF { + break + } + return err + } + + // We handle this case outside fn for cleanliness reasons. + var ierr *genericError + if sync.Type == procError { + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") + } + if ierr != nil { + return ierr + } + // Programmer error. + panic("No error following JSON procError payload.") + } + + if err := fn(&sync); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go new file mode 100644 index 0000000..a4ae890 --- /dev/null +++ b/libcontainer/system/linux.go @@ -0,0 +1,155 @@ +// +build linux + +package system + +import ( + "os" + "os/exec" + "syscall" // only for exec + "unsafe" + + "github.com/opencontainers/runc/libcontainer/user" + "golang.org/x/sys/unix" +) + +// If arg2 is nonzero, set the "child subreaper" attribute of the +// calling process; if arg2 is zero, unset the attribute. When a +// process is marked as a child subreaper, all of the children +// that it creates, and their descendants, will be marked as +// having a subreaper. In effect, a subreaper fulfills the role +// of init(1) for its descendant processes. Upon termination of +// a process that is orphaned (i.e., its immediate parent has +// already terminated) and marked as having a subreaper, the +// nearest still living ancestor subreaper will receive a SIGCHLD +// signal and be able to wait(2) on the process to discover its +// termination status. +const PR_SET_CHILD_SUBREAPER = 36 + +type ParentDeathSignal int + +func (p ParentDeathSignal) Restore() error { + if p == 0 { + return nil + } + current, err := GetParentDeathSignal() + if err != nil { + return err + } + if p == current { + return nil + } + return p.Set() +} + +func (p ParentDeathSignal) Set() error { + return SetParentDeathSignal(uintptr(p)) +} + +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + + return syscall.Exec(name, args, env) +} + +func Prlimit(pid, resource int, limit unix.Rlimit) error { + _, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0) + if err != 0 { + return err + } + return nil +} + +func SetParentDeathSignal(sig uintptr) error { + if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { + return err + } + return nil +} + +func GetParentDeathSignal() (ParentDeathSignal, error) { + var sig int + if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil { + return -1, err + } + return ParentDeathSignal(sig), nil +} + +func SetKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func ClearKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func Setctty() error { + if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil { + return err + } + return nil +} + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +func RunningInUserNS() bool { + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return false + } + return UIDMapInUserNS(uidmap) +} + +func UIDMapInUserNS(uidmap []user.IDMap) bool { + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. + */ + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { + return false + } + return true +} + +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int64 { + euid := int64(os.Geteuid()) + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + +// SetSubreaper sets the value i as the subreaper setting for the calling process +func SetSubreaper(i int) error { + return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} diff --git a/libcontainer/system/linux_test.go b/libcontainer/system/linux_test.go new file mode 100644 index 0000000..4d613d8 --- /dev/null +++ b/libcontainer/system/linux_test.go @@ -0,0 +1,45 @@ +// +build linux + +package system + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/user" +) + +func TestUIDMapInUserNS(t *testing.T) { + cases := []struct { + s string + expected bool + }{ + { + s: " 0 0 4294967295\n", + expected: false, + }, + { + s: " 0 0 1\n", + expected: true, + }, + { + s: " 0 1001 1\n 1 231072 65536\n", + expected: true, + }, + { + // file exist but empty (the initial state when userns is created. see man 7 user_namespaces) + s: "", + expected: true, + }, + } + for _, c := range cases { + uidmap, err := user.ParseIDMap(strings.NewReader(c.s)) + if err != nil { + t.Fatal(err) + } + actual := UIDMapInUserNS(uidmap) + if c.expected != actual { + t.Fatalf("expected %v, got %v for %q", c.expected, actual, c.s) + } + } +} diff --git a/libcontainer/system/proc.go b/libcontainer/system/proc.go new file mode 100644 index 0000000..79232a4 --- /dev/null +++ b/libcontainer/system/proc.go @@ -0,0 +1,113 @@ +package system + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" +) + +// State is the status of a process. +type State rune + +const ( // Only values for Linux 3.14 and later are listed here + Dead State = 'X' + DiskSleep State = 'D' + Running State = 'R' + Sleeping State = 'S' + Stopped State = 'T' + TracingStop State = 't' + Zombie State = 'Z' +) + +// String forms of the state from proc(5)'s documentation for +// /proc/[pid]/status' "State" field. +func (s State) String() string { + switch s { + case Dead: + return "dead" + case DiskSleep: + return "disk sleep" + case Running: + return "running" + case Sleeping: + return "sleeping" + case Stopped: + return "stopped" + case TracingStop: + return "tracing stop" + case Zombie: + return "zombie" + default: + return fmt.Sprintf("unknown (%c)", s) + } +} + +// Stat_t represents the information from /proc/[pid]/stat, as +// described in proc(5) with names based on the /proc/[pid]/status +// fields. +type Stat_t struct { + // PID is the process ID. + PID uint + + // Name is the command run by the process. + Name string + + // State is the state of the process. + State State + + // StartTime is the number of clock ticks after system boot (since + // Linux 2.6). + StartTime uint64 +} + +// Stat returns a Stat_t instance for the specified process. +func Stat(pid int) (stat Stat_t, err error) { + bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + if err != nil { + return stat, err + } + return parseStat(string(bytes)) +} + +// GetProcessStartTime is deprecated. Use Stat(pid) and +// Stat_t.StartTime instead. +func GetProcessStartTime(pid int) (string, error) { + stat, err := Stat(pid) + if err != nil { + return "", err + } + return fmt.Sprintf("%d", stat.StartTime), nil +} + +func parseStat(data string) (stat Stat_t, err error) { + // From proc(5), field 2 could contain space and is inside `(` and `)`. + // The following is an example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + i := strings.LastIndex(data, ")") + if i <= 2 || i >= len(data)-1 { + return stat, fmt.Errorf("invalid stat data: %q", data) + } + + parts := strings.SplitN(data[:i], "(", 2) + if len(parts) != 2 { + return stat, fmt.Errorf("invalid stat data: %q", data) + } + + stat.Name = parts[1] + _, err = fmt.Sscanf(parts[0], "%d", &stat.PID) + if err != nil { + return stat, err + } + + // parts indexes should be offset by 3 from the field number given + // proc(5), because parts is zero-indexed and we've removed fields + // one (PID) and two (Name) in the paren-split. + parts = strings.Split(data[i+2:], " ") + var state int + fmt.Sscanf(parts[3-3], "%c", &state) + stat.State = State(state) + fmt.Sscanf(parts[22-3], "%d", &stat.StartTime) + return stat, nil +} diff --git a/libcontainer/system/proc_test.go b/libcontainer/system/proc_test.go new file mode 100644 index 0000000..7e1acc5 --- /dev/null +++ b/libcontainer/system/proc_test.go @@ -0,0 +1,45 @@ +package system + +import "testing" + +func TestParseStartTime(t *testing.T) { + data := map[string]Stat_t{ + "4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": { + PID: 4902, + Name: "gunicorn: maste", + State: 'S', + StartTime: 9126532, + }, + "9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": { + PID: 9534, + Name: "cat", + State: 'R', + StartTime: 9214966, + }, + + "24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": { + PID: 24767, + Name: "irq/44-mei_me", + State: 'S', + StartTime: 8722075, + }, + } + for line, expected := range data { + st, err := parseStat(line) + if err != nil { + t.Fatal(err) + } + if st.PID != expected.PID { + t.Fatalf("expected PID %q but received %q", expected.PID, st.PID) + } + if st.State != expected.State { + t.Fatalf("expected state %q but received %q", expected.State, st.State) + } + if st.Name != expected.Name { + t.Fatalf("expected name %q but received %q", expected.Name, st.Name) + } + if st.StartTime != expected.StartTime { + t.Fatalf("expected start time %q but received %q", expected.StartTime, st.StartTime) + } + } +} diff --git a/libcontainer/system/syscall_linux_32.go b/libcontainer/system/syscall_linux_32.go new file mode 100644 index 0000000..c5ca5d8 --- /dev/null +++ b/libcontainer/system/syscall_linux_32.go @@ -0,0 +1,26 @@ +// +build linux +// +build 386 arm + +package system + +import ( + "golang.org/x/sys/unix" +) + +// Setuid sets the uid of the calling thread to the specified uid. +func Setuid(uid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} + +// Setgid sets the gid of the calling thread to the specified gid. +func Setgid(gid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/libcontainer/system/syscall_linux_64.go b/libcontainer/system/syscall_linux_64.go new file mode 100644 index 0000000..e05e30a --- /dev/null +++ b/libcontainer/system/syscall_linux_64.go @@ -0,0 +1,26 @@ +// +build linux +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x + +package system + +import ( + "golang.org/x/sys/unix" +) + +// Setuid sets the uid of the calling thread to the specified uid. +func Setuid(uid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} + +// Setgid sets the gid of the calling thread to the specified gid. +func Setgid(gid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/libcontainer/system/sysconfig.go b/libcontainer/system/sysconfig.go new file mode 100644 index 0000000..b8434f1 --- /dev/null +++ b/libcontainer/system/sysconfig.go @@ -0,0 +1,12 @@ +// +build cgo,linux + +package system + +/* +#include +*/ +import "C" + +func GetClockTicks() int { + return int(C.sysconf(C._SC_CLK_TCK)) +} diff --git a/libcontainer/system/sysconfig_notcgo.go b/libcontainer/system/sysconfig_notcgo.go new file mode 100644 index 0000000..d93b5d5 --- /dev/null +++ b/libcontainer/system/sysconfig_notcgo.go @@ -0,0 +1,15 @@ +// +build !cgo windows + +package system + +func GetClockTicks() int { + // TODO figure out a better alternative for platforms where we're missing cgo + // + // TODO Windows. This could be implemented using Win32 QueryPerformanceFrequency(). + // https://msdn.microsoft.com/en-us/library/windows/desktop/ms644905(v=vs.85).aspx + // + // An example of its usage can be found here. + // https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx + + return 100 +} diff --git a/libcontainer/system/unsupported.go b/libcontainer/system/unsupported.go new file mode 100644 index 0000000..b94be74 --- /dev/null +++ b/libcontainer/system/unsupported.go @@ -0,0 +1,27 @@ +// +build !linux + +package system + +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + +// RunningInUserNS is a stub for non-Linux systems +// Always returns false +func RunningInUserNS() bool { + return false +} + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/libcontainer/system/xattrs_linux.go b/libcontainer/system/xattrs_linux.go new file mode 100644 index 0000000..a6823fc --- /dev/null +++ b/libcontainer/system/xattrs_linux.go @@ -0,0 +1,35 @@ +package system + +import "golang.org/x/sys/unix" + +// Returns a []byte slice if the xattr is set and nil otherwise +// Requires path and its attribute as arguments +func Lgetxattr(path string, attr string) ([]byte, error) { + var sz int + // Start with a 128 length byte array + dest := make([]byte, 128) + sz, errno := unix.Lgetxattr(path, attr, dest) + + switch { + case errno == unix.ENODATA: + return nil, errno + case errno == unix.ENOTSUP: + return nil, errno + case errno == unix.ERANGE: + // 128 byte array might just not be good enough, + // A dummy buffer is used to get the real size + // of the xattrs on disk + sz, errno = unix.Lgetxattr(path, attr, []byte{}) + if errno != nil { + return nil, errno + } + dest = make([]byte, sz) + sz, errno = unix.Lgetxattr(path, attr, dest) + if errno != nil { + return nil, errno + } + case errno != nil: + return nil, errno + } + return dest[:sz], nil +} diff --git a/libcontainer/user/MAINTAINERS b/libcontainer/user/MAINTAINERS new file mode 100644 index 0000000..edbe200 --- /dev/null +++ b/libcontainer/user/MAINTAINERS @@ -0,0 +1,2 @@ +Tianon Gravi (@tianon) +Aleksa Sarai (@cyphar) diff --git a/libcontainer/user/lookup.go b/libcontainer/user/lookup.go new file mode 100644 index 0000000..6fd8dd0 --- /dev/null +++ b/libcontainer/user/lookup.go @@ -0,0 +1,41 @@ +package user + +import ( + "errors" +) + +var ( + // The current operating system does not provide the required data for user lookups. + ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") + // No matching entries found in file. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + ErrNoGroupEntries = errors.New("no matching entries in group file") +) + +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (User, error) { + return lookupUser(username) +} + +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupId +// returns an error. +func LookupUid(uid int) (User, error) { + return lookupUid(uid) +} + +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (Group, error) { + return lookupGroup(groupname) +} + +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. +func LookupGid(gid int) (Group, error) { + return lookupGid(gid) +} diff --git a/libcontainer/user/lookup_unix.go b/libcontainer/user/lookup_unix.go new file mode 100644 index 0000000..92b5ae8 --- /dev/null +++ b/libcontainer/user/lookup_unix.go @@ -0,0 +1,144 @@ +// +build darwin dragonfly freebsd linux netbsd openbsd solaris + +package user + +import ( + "io" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// Unix-specific path to the passwd and group formatted files. +const ( + unixPasswdPath = "/etc/passwd" + unixGroupPath = "/etc/group" +) + +func lookupUser(username string) (User, error) { + return lookupUserFunc(func(u User) bool { + return u.Name == username + }) +} + +func lookupUid(uid int) (User, error) { + return lookupUserFunc(func(u User) bool { + return u.Uid == uid + }) +} + +func lookupUserFunc(filter func(u User) bool) (User, error) { + // Get operating system-specific passwd reader-closer. + passwd, err := GetPasswd() + if err != nil { + return User{}, err + } + defer passwd.Close() + + // Get the users. + users, err := ParsePasswdFilter(passwd, filter) + if err != nil { + return User{}, err + } + + // No user entries found. + if len(users) == 0 { + return User{}, ErrNoPasswdEntries + } + + // Assume the first entry is the "correct" one. + return users[0], nil +} + +func lookupGroup(groupname string) (Group, error) { + return lookupGroupFunc(func(g Group) bool { + return g.Name == groupname + }) +} + +func lookupGid(gid int) (Group, error) { + return lookupGroupFunc(func(g Group) bool { + return g.Gid == gid + }) +} + +func lookupGroupFunc(filter func(g Group) bool) (Group, error) { + // Get operating system-specific group reader-closer. + group, err := GetGroup() + if err != nil { + return Group{}, err + } + defer group.Close() + + // Get the users. + groups, err := ParseGroupFilter(group, filter) + if err != nil { + return Group{}, err + } + + // No user entries found. + if len(groups) == 0 { + return Group{}, ErrNoGroupEntries + } + + // Assume the first entry is the "correct" one. + return groups[0], nil +} + +func GetPasswdPath() (string, error) { + return unixPasswdPath, nil +} + +func GetPasswd() (io.ReadCloser, error) { + return os.Open(unixPasswdPath) +} + +func GetGroupPath() (string, error) { + return unixGroupPath, nil +} + +func GetGroup() (io.ReadCloser, error) { + return os.Open(unixGroupPath) +} + +// CurrentUser looks up the current user by their user id in /etc/passwd. If the +// user cannot be found (or there is no /etc/passwd file on the filesystem), +// then CurrentUser returns an error. +func CurrentUser() (User, error) { + return LookupUid(unix.Getuid()) +} + +// CurrentGroup looks up the current user's group by their primary group id's +// entry in /etc/passwd. If the group cannot be found (or there is no +// /etc/group file on the filesystem), then CurrentGroup returns an error. +func CurrentGroup() (Group, error) { + return LookupGid(unix.Getgid()) +} + +func currentUserSubIDs(fileName string) ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) +} + +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/libcontainer/user/lookup_windows.go b/libcontainer/user/lookup_windows.go new file mode 100644 index 0000000..65cd40e --- /dev/null +++ b/libcontainer/user/lookup_windows.go @@ -0,0 +1,40 @@ +// +build windows + +package user + +import ( + "fmt" + "os/user" +) + +func lookupUser(username string) (User, error) { + u, err := user.Lookup(username) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupUid(uid int) (User, error) { + u, err := user.LookupId(fmt.Sprintf("%d", uid)) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupGroup(groupname string) (Group, error) { + g, err := user.LookupGroup(groupname) + if err != nil { + return Group{}, err + } + return groupFromOS(g) +} + +func lookupGid(gid int) (Group, error) { + g, err := user.LookupGroupId(fmt.Sprintf("%d", gid)) + if err != nil { + return Group{}, err + } + return groupFromOS(g) +} diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go new file mode 100644 index 0000000..7b912bb --- /dev/null +++ b/libcontainer/user/user.go @@ -0,0 +1,608 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "os/user" + "strconv" + "strings" +) + +const ( + minId = 0 + maxId = 1<<31 - 1 //for 32-bit systems compatibility +) + +var ( + ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +// userFromOS converts an os/user.(*User) to local User +// +// (This does not include Pass, Shell or Gecos) +func userFromOS(u *user.User) (User, error) { + newUser := User{ + Name: u.Username, + Home: u.HomeDir, + } + id, err := strconv.Atoi(u.Uid) + if err != nil { + return newUser, err + } + newUser.Uid = id + + id, err = strconv.Atoi(u.Gid) + if err != nil { + return newUser, err + } + newUser.Gid = id + return newUser, nil +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +// groupFromOS converts an os/user.(*Group) to local Group +// +// (This does not include Pass, Shell or Gecos) +func groupFromOS(g *user.Group) (Group, error) { + newGroup := Group{ + Name: g.Name, + } + + id, err := strconv.Atoi(g.Gid) + if err != nil { + return newGroup, err + } + newGroup.Gid = id + + return newGroup, nil +} + +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int64 + Count int64 +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int64 + ParentID int64 + Count int64 +} + +func parseLine(line string, v ...interface{}) { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { + return + } + + for i, p := range parts { + // Ignore cases where we don't have enough fields to populate the arguments. + // Some configuration files like to misbehave. + if len(v) <= i { + break + } + + // Use the type of the argument to figure out how to parse it, scanf() style. + // This is legit. + switch e := v[i].(type) { + case *string: + *e = p + case *int: + // "numbers", with conversion errors ignored because of some misbehaving configuration files. + *e, _ = strconv.Atoi(p) + case *int64: + *e, _ = strconv.ParseInt(p, 10, 64) + case *[]string: + // Comma-separated lists. + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: + // Someone goof'd when writing code using this function. Scream so they can hear us. + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) + } + } +} + +func ParsePasswdFile(path string) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswd(passwd) +} + +func ParsePasswd(passwd io.Reader) ([]User, error) { + return ParsePasswdFilter(passwd, nil) +} + +func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswdFilter(passwd, filter) +} + +func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { + if r == nil { + return nil, fmt.Errorf("nil source for passwd-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []User{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := User{} + parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseGroupFile(path string) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + + defer group.Close() + return ParseGroup(group) +} + +func ParseGroup(group io.Reader) ([]Group, error) { + return ParseGroupFilter(group, nil) +} + +func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + defer group.Close() + return ParseGroupFilter(group, filter) +} + +func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { + if r == nil { + return nil, fmt.Errorf("nil source for group-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []Group{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + if text == "" { + continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := Group{} + parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +type ExecUser struct { + Uid int + Gid int + Sgids []int + Home string +} + +// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the +// given file paths and uses that data as the arguments to GetExecUser. If the +// files cannot be opened for any reason, the error is ignored and a nil +// io.Reader is passed instead. +func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { + var passwd, group io.Reader + + if passwdFile, err := os.Open(passwdPath); err == nil { + passwd = passwdFile + defer passwdFile.Close() + } + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + + return GetExecUser(userSpec, defaults, passwd, group) +} + +// GetExecUser parses a user specification string (using the passwd and group +// readers as sources for /etc/passwd and /etc/group data, respectively). In +// the case of blank fields or missing data from the sources, the values in +// defaults is used. +// +// GetExecUser will return an error if a user or group literal could not be +// found in any entry in passwd and group respectively. +// +// Examples of valid user specifications are: +// * "" +// * "user" +// * "uid" +// * "user:group" +// * "uid:gid +// * "user:gid" +// * "uid:group" +// +// It should be noted that if you specify a numeric user or group id, they will +// not be evaluated as usernames (only the metadata will be filled). So attempting +// to parse a user with user.Name = "1337" will produce the user with a UID of +// 1337. +func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { + if defaults == nil { + defaults = new(ExecUser) + } + + // Copy over defaults. + user := &ExecUser{ + Uid: defaults.Uid, + Gid: defaults.Gid, + Sgids: defaults.Sgids, + Home: defaults.Home, + } + + // Sgids slice *cannot* be nil. + if user.Sgids == nil { + user.Sgids = []int{} + } + + // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax + var userArg, groupArg string + parseLine(userSpec, &userArg, &groupArg) + + // Convert userArg and groupArg to be numeric, so we don't have to execute + // Atoi *twice* for each iteration over lines. + uidArg, uidErr := strconv.Atoi(userArg) + gidArg, gidErr := strconv.Atoi(groupArg) + + // Find the matching user. + users, err := ParsePasswdFilter(passwd, func(u User) bool { + if userArg == "" { + // Default to current state of the user. + return u.Uid == user.Uid + } + + if uidErr == nil { + // If the userArg is numeric, always treat it as a UID. + return uidArg == u.Uid + } + + return u.Name == userArg + }) + + // If we can't find the user, we have to bail. + if err != nil && passwd != nil { + if userArg == "" { + userArg = strconv.Itoa(user.Uid) + } + return nil, fmt.Errorf("unable to find user %s: %v", userArg, err) + } + + var matchedUserName string + if len(users) > 0 { + // First match wins, even if there's more than one matching entry. + matchedUserName = users[0].Name + user.Uid = users[0].Uid + user.Gid = users[0].Gid + user.Home = users[0].Home + } else if userArg != "" { + // If we can't find a user with the given username, the only other valid + // option is if it's a numeric username with no associated entry in passwd. + + if uidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries) + } + user.Uid = uidArg + + // Must be inside valid uid range. + if user.Uid < minId || user.Uid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + + // On to the groups. If we matched a username, we need to do this because of + // the supplementary group IDs. + if groupArg != "" || matchedUserName != "" { + groups, err := ParseGroupFilter(group, func(g Group) bool { + // If the group argument isn't explicit, we'll just search for it. + if groupArg == "" { + // Check if user is a member of this group. + for _, u := range g.List { + if u == matchedUserName { + return true + } + } + return false + } + + if gidErr == nil { + // If the groupArg is numeric, always treat it as a GID. + return gidArg == g.Gid + } + + return g.Name == groupArg + }) + if err != nil && group != nil { + return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err) + } + + // Only start modifying user.Gid if it is in explicit form. + if groupArg != "" { + if len(groups) > 0 { + // First match wins, even if there's more than one matching entry. + user.Gid = groups[0].Gid + } else { + // If we can't find a group with the given name, the only other valid + // option is if it's a numeric group name with no associated entry in group. + + if gidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries) + } + user.Gid = gidArg + + // Must be inside valid gid range. + if user.Gid < minId || user.Gid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + } else if len(groups) > 0 { + // Supplementary group ids only make sense if in the implicit form. + user.Sgids = make([]int, len(groups)) + for i, group := range groups { + user.Sgids[i] = group.Gid + } + } + } + + return user, nil +} + +// GetAdditionalGroups looks up a list of groups by name or group id +// against the given /etc/group formatted data. If a group name cannot +// be found, an error will be returned. If a group id cannot be found, +// or the given group data is nil, the id will be returned as-is +// provided it is in the legal range. +func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { + var groups = []Group{} + if group != nil { + var err error + groups, err = ParseGroupFilter(group, func(g Group) bool { + for _, ag := range additionalGroups { + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + return true + } + } + return false + }) + if err != nil { + return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err) + } + } + + gidMap := make(map[int]struct{}) + for _, ag := range additionalGroups { + var found bool + for _, g := range groups { + // if we found a matched group either by name or gid, take the + // first matched as correct + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + if _, ok := gidMap[g.Gid]; !ok { + gidMap[g.Gid] = struct{}{} + found = true + break + } + } + } + // we asked for a group but didn't find it. let's check to see + // if we wanted a numeric group + if !found { + gid, err := strconv.Atoi(ag) + if err != nil { + return nil, fmt.Errorf("Unable to find group %s", ag) + } + // Ensure gid is inside gid range. + if gid < minId || gid > maxId { + return nil, ErrRange + } + gidMap[gid] = struct{}{} + } + } + gids := []int{} + for gid := range gidMap { + gids = append(gids, gid) + } + return gids, nil +} + +// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups +// that opens the groupPath given and gives it as an argument to +// GetAdditionalGroups. +func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { + var group io.Reader + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + return GetAdditionalGroups(additionalGroups, group) +} + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []IDMap{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go new file mode 100644 index 0000000..24ee559 --- /dev/null +++ b/libcontainer/user/user_test.go @@ -0,0 +1,507 @@ +package user + +import ( + "io" + "reflect" + "sort" + "strconv" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +func TestUserParseLine(t *testing.T) { + var ( + a, b string + c []string + d int + ) + + parseLine("", &a, &b) + if a != "" || b != "" { + t.Fatalf("a and b should be empty ('%v', '%v')", a, b) + } + + parseLine("a", &a, &b) + if a != "a" || b != "" { + t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) + } + + parseLine("bad boys:corny cows", &a, &b) + if a != "bad boys" || b != "corny cows" { + t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) + } + + parseLine("", &c) + if len(c) != 0 { + t.Fatalf("c should be empty (%#v)", c) + } + + parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { + t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("::::::::::", &a, &b, &c) + if a != "" || b != "" || len(c) != 0 { + t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("not a number", &d) + if d != 0 { + t.Fatalf("d should be 0 (%v)", d) + } + + parseLine("b:12:c", &a, &d, &b) + if a != "b" || b != "c" || d != 12 { + t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) + } +} + +func TestUserParsePasswd(t *testing.T) { + users, err := ParsePasswdFilter(strings.NewReader(` +root:x:0:0:root:/root:/bin/bash +adm:x:3:4:adm:/var/adm:/bin/false +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(users) != 3 { + t.Fatalf("Expected 3 users, got %v", len(users)) + } + if users[0].Uid != 0 || users[0].Name != "root" { + t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name) + } + if users[1].Uid != 3 || users[1].Name != "adm" { + t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name) + } +} + +func TestUserParseGroup(t *testing.T) { + groups, err := ParseGroupFilter(strings.NewReader(` +root:x:0:root +adm:x:4:root,adm,daemon +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(groups) != 3 { + t.Fatalf("Expected 3 groups, got %v", len(groups)) + } + if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { + t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) + } + if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 { + t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List)) + } +} + +func TestValidGetExecUser(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +111:x:222:333::/var/garbage +odd:x:111:112::/home/odd::::: +this is just some garbage data +` + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +444:x:555:111 +odd:x:444: +this is just some garbage data +` + defaultExecUser := ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + } + + tests := []struct { + ref string + expected ExecUser + }{ + { + ref: "root", + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{0, 1234}, + Home: "/root", + }, + }, + { + ref: "adm", + expected: ExecUser{ + Uid: 42, + Gid: 43, + Sgids: []int{1234}, + Home: "/var/adm", + }, + }, + { + ref: "root:adm", + expected: ExecUser{ + Uid: 0, + Gid: 43, + Sgids: defaultExecUser.Sgids, + Home: "/root", + }, + }, + { + ref: "adm:1234", + expected: ExecUser{ + Uid: 42, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: "/var/adm", + }, + }, + { + ref: "42:1234", + expected: ExecUser{ + Uid: 42, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: "/var/adm", + }, + }, + { + ref: "1337:1234", + expected: ExecUser{ + Uid: 1337, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + { + ref: "1337", + expected: ExecUser{ + Uid: 1337, + Gid: defaultExecUser.Gid, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + { + ref: "", + expected: ExecUser{ + Uid: defaultExecUser.Uid, + Gid: defaultExecUser.Gid, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + + // Regression tests for #695. + { + ref: "111", + expected: ExecUser{ + Uid: 111, + Gid: 112, + Sgids: defaultExecUser.Sgids, + Home: "/home/odd", + }, + }, + { + ref: "111:444", + expected: ExecUser{ + Uid: 111, + Gid: 444, + Sgids: defaultExecUser.Sgids, + Home: "/home/odd", + }, + }, + } + + for _, test := range tests { + passwd := strings.NewReader(passwdContent) + group := strings.NewReader(groupContent) + + execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group) + if err != nil { + t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error()) + t.Fail() + continue + } + + if !reflect.DeepEqual(test.expected, *execUser) { + t.Logf("ref: %v", test.ref) + t.Logf("got: %#v", execUser) + t.Logf("expected: %#v", test.expected) + t.Fail() + continue + } + } +} + +func TestInvalidGetExecUser(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +-42:x:12:13:broken:/very/broken +this is just some garbage data +` + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +this is just some garbage data +` + + tests := []string{ + // No such user/group. + "notuser", + "notuser:notgroup", + "root:notgroup", + "notuser:adm", + "8888:notgroup", + "notuser:8888", + + // Invalid user/group values. + "-1:0", + "0:-3", + "-5:-2", + "-42", + "-43", + } + + for _, test := range tests { + passwd := strings.NewReader(passwdContent) + group := strings.NewReader(groupContent) + + execUser, err := GetExecUser(test, nil, passwd, group) + if err == nil { + t.Logf("got unexpected success when parsing '%s': %#v", test, execUser) + t.Fail() + continue + } + } +} + +func TestGetExecUserNilSources(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +this is just some garbage data +` + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +this is just some garbage data +` + + defaultExecUser := ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + } + + tests := []struct { + ref string + passwd, group bool + expected ExecUser + }{ + { + ref: "", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + { + ref: "root", + passwd: true, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{8888}, + Home: "/root", + }, + }, + { + ref: "0", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + { + ref: "0:0", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + } + + for _, test := range tests { + var passwd, group io.Reader + + if test.passwd { + passwd = strings.NewReader(passwdContent) + } + + if test.group { + group = strings.NewReader(groupContent) + } + + execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group) + if err != nil { + t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error()) + t.Fail() + continue + } + + if !reflect.DeepEqual(test.expected, *execUser) { + t.Logf("got: %#v", execUser) + t.Logf("expected: %#v", test.expected) + t.Fail() + continue + } + } +} + +func TestGetAdditionalGroups(t *testing.T) { + type foo struct { + groups []string + expected []int + hasError bool + } + + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +adm:x:4343:root,adm-duplicate +this is just some garbage data +` + tests := []foo{ + { + // empty group + groups: []string{}, + expected: []int{}, + }, + { + // single group + groups: []string{"adm"}, + expected: []int{43}, + }, + { + // multiple groups + groups: []string{"adm", "grp"}, + expected: []int{43, 1234}, + }, + { + // invalid group + groups: []string{"adm", "grp", "not-exist"}, + expected: nil, + hasError: true, + }, + { + // group with numeric id + groups: []string{"43"}, + expected: []int{43}, + }, + { + // group with unknown numeric id + groups: []string{"adm", "10001"}, + expected: []int{43, 10001}, + }, + { + // groups specified twice with numeric and name + groups: []string{"adm", "43"}, + expected: []int{43}, + }, + { + // groups with too small id + groups: []string{"-1"}, + expected: nil, + hasError: true, + }, + } + + if utils.GetIntSize() > 4 { + tests = append(tests, foo{ + // groups with too large id + groups: []string{strconv.Itoa(1 << 31)}, + expected: nil, + hasError: true, + }) + } + + for _, test := range tests { + group := strings.NewReader(groupContent) + + gids, err := GetAdditionalGroups(test.groups, group) + if test.hasError && err == nil { + t.Errorf("Parse(%#v) expects error but has none", test) + continue + } + if !test.hasError && err != nil { + t.Errorf("Parse(%#v) has error %v", test, err) + continue + } + sort.Sort(sort.IntSlice(gids)) + if !reflect.DeepEqual(gids, test.expected) { + t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) + } + } +} + +func TestGetAdditionalGroupsNumeric(t *testing.T) { + tests := []struct { + groups []string + expected []int + hasError bool + }{ + { + // numeric groups only + groups: []string{"1234", "5678"}, + expected: []int{1234, 5678}, + }, + { + // numeric and alphabetic + groups: []string{"1234", "fake"}, + expected: nil, + hasError: true, + }, + } + + for _, test := range tests { + gids, err := GetAdditionalGroups(test.groups, nil) + if test.hasError && err == nil { + t.Errorf("Parse(%#v) expects error but has none", test) + continue + } + if !test.hasError && err != nil { + t.Errorf("Parse(%#v) has error %v", test, err) + continue + } + sort.Sort(sort.IntSlice(gids)) + if !reflect.DeepEqual(gids, test.expected) { + t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) + } + } +} diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go new file mode 100644 index 0000000..c8a9364 --- /dev/null +++ b/libcontainer/utils/cmsg.go @@ -0,0 +1,93 @@ +// +build linux + +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// MaxSendfdLen is the maximum length of the name of a file descriptor being +// sent using SendFd. The name of the file handle returned by RecvFd will never +// be larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + // For some reason, unix.Recvmsg uses the length rather than the capacity + // when passing the msg_controllen and other attributes to recvmsg. So we + // have to actually set the length. + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + if err != nil { + return nil, err + } + + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return nil, err + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket *os.File, name string, fd uintptr) error { + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go new file mode 100644 index 0000000..40ccfaa --- /dev/null +++ b/libcontainer/utils/utils.go @@ -0,0 +1,112 @@ +package utils + +import ( + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "unsafe" + + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go new file mode 100644 index 0000000..395eedc --- /dev/null +++ b/libcontainer/utils/utils_test.go @@ -0,0 +1,142 @@ +package utils + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "testing" + + "golang.org/x/sys/unix" +) + +var labelTest = []struct { + labels []string + query string + expectedValue string +}{ + {[]string{"bundle=/path/to/bundle"}, "bundle", "/path/to/bundle"}, + {[]string{"test=a", "test=b"}, "bundle", ""}, + {[]string{"bundle=a", "test=b", "bundle=c"}, "bundle", "a"}, + {[]string{"", "test=a", "bundle=b"}, "bundle", "b"}, + {[]string{"test", "bundle=a"}, "bundle", "a"}, + {[]string{"test=a", "bundle="}, "bundle", ""}, +} + +func TestSearchLabels(t *testing.T) { + for _, tt := range labelTest { + if v := SearchLabels(tt.labels, tt.query); v != tt.expectedValue { + t.Errorf("expected value '%s' for query '%s'; got '%s'", tt.expectedValue, tt.query, v) + } + } +} + +func TestResolveRootfs(t *testing.T) { + dir := "rootfs" + os.Mkdir(dir, 0600) + defer os.Remove(dir) + + path, err := ResolveRootfs(dir) + if err != nil { + t.Fatal(err) + } + pwd, err := os.Getwd() + if err != nil { + t.Fatal(err) + } + if path != fmt.Sprintf("%s/%s", pwd, "rootfs") { + t.Errorf("expected rootfs to be abs and was %s", path) + } +} + +func TestResolveRootfsWithSymlink(t *testing.T) { + dir := "rootfs" + tmpDir, _ := filepath.EvalSymlinks(os.TempDir()) + os.Symlink(tmpDir, dir) + defer os.Remove(dir) + + path, err := ResolveRootfs(dir) + if err != nil { + t.Fatal(err) + } + + if path != tmpDir { + t.Errorf("expected rootfs to be the real path %s and was %s", path, os.TempDir()) + } +} + +func TestResolveRootfsWithNonExistingDir(t *testing.T) { + _, err := ResolveRootfs("foo") + if err == nil { + t.Error("expected error to happen but received nil") + } +} + +func TestExitStatus(t *testing.T) { + status := unix.WaitStatus(0) + ex := ExitStatus(status) + if ex != 0 { + t.Errorf("expected exit status to equal 0 and received %d", ex) + } +} + +func TestExitStatusSignaled(t *testing.T) { + status := unix.WaitStatus(2) + ex := ExitStatus(status) + if ex != 130 { + t.Errorf("expected exit status to equal 130 and received %d", ex) + } +} + +func TestWriteJSON(t *testing.T) { + person := struct { + Name string + Age int + }{ + Name: "Alice", + Age: 30, + } + + var b bytes.Buffer + err := WriteJSON(&b, person) + if err != nil { + t.Fatal(err) + } + + expected := `{"Name":"Alice","Age":30}` + if b.String() != expected { + t.Errorf("expected to write %s but was %s", expected, b.String()) + } +} + +func TestCleanPath(t *testing.T) { + path := CleanPath("") + if path != "" { + t.Errorf("expected to receive empty string and received %s", path) + } + + path = CleanPath("rootfs") + if path != "rootfs" { + t.Errorf("expected to receive 'rootfs' and received %s", path) + } + + path = CleanPath("../../../var") + if path != "var" { + t.Errorf("expected to receive 'var' and received %s", path) + } + + path = CleanPath("/../../../var") + if path != "/var" { + t.Errorf("expected to receive '/var' and received %s", path) + } + + path = CleanPath("/foo/bar/") + if path != "/foo/bar" { + t.Errorf("expected to receive '/foo/bar' and received %s", path) + } + + path = CleanPath("/foo/bar/../") + if path != "/foo" { + t.Errorf("expected to receive '/foo' and received %s", path) + } +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go new file mode 100644 index 0000000..1576f2d --- /dev/null +++ b/libcontainer/utils/utils_unix.go @@ -0,0 +1,68 @@ +// +build !windows + +package utils + +import ( + "fmt" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// EnsureProcHandle returns whether or not the given file handle is on procfs. +func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for +// the process (except for those below the given fd value). +func CloseExecFrom(minFd int) error { + fdDir, err := os.Open("/proc/self/fd") + if err != nil { + return err + } + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. + if err != nil { + continue + } + // Ignore descriptors lower than our specified minimum. + if fd < minFd { + continue + } + // Intentionally ignore errors from unix.CloseOnExec -- the cases where + // this might fail are basically file descriptors that have already + // been closed (including and especially the one that was created when + // ioutil.ReadDir did the "opendir" syscall). + unix.CloseOnExec(fd) + } + return nil +} + +// NewSockPair returns a new unix socket pair +func NewSockPair(name string) (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} diff --git a/list.go b/list.go new file mode 100644 index 0000000..0313d8c --- /dev/null +++ b/list.go @@ -0,0 +1,175 @@ +// +build linux + +package main + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" + "text/tabwriter" + "time" + + "encoding/json" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +const formatOptions = `table or json` + +// containerState represents the platform agnostic pieces relating to a +// running container's status and state +type containerState struct { + // Version is the OCI version for the container + Version string `json:"ociVersion"` + // ID is the container ID + ID string `json:"id"` + // InitProcessPid is the init process id in the parent namespace + InitProcessPid int `json:"pid"` + // Status is the current status of the container, running, paused, ... + Status string `json:"status"` + // Bundle is the path on the filesystem to the bundle + Bundle string `json:"bundle"` + // Rootfs is a path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + // Annotations is the user defined annotations added to the config. + Annotations map[string]string `json:"annotations,omitempty"` + // The owner of the state directory (the owner of the container). + Owner string `json:"owner"` +} + +var listCommand = cli.Command{ + Name: "list", + Usage: "lists containers started by runc with the given root", + ArgsUsage: ` + +Where the given root is specified via the global option "--root" +(default: "/run/runc"). + +EXAMPLE 1: +To list containers created via the default "--root": + # runc list + +EXAMPLE 2: +To list containers created using a non-default value for "--root": + # runc --root value list`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "format, f", + Value: "table", + Usage: `select one of: ` + formatOptions, + }, + cli.BoolFlag{ + Name: "quiet, q", + Usage: "display only container IDs", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 0, exactArgs); err != nil { + return err + } + s, err := getContainers(context) + if err != nil { + return err + } + + if context.Bool("quiet") { + for _, item := range s { + fmt.Println(item.ID) + } + return nil + } + + switch context.String("format") { + case "table": + w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") + for _, item := range s { + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", + item.ID, + item.InitProcessPid, + item.Status, + item.Bundle, + item.Created.Format(time.RFC3339Nano), + item.Owner) + } + if err := w.Flush(); err != nil { + return err + } + case "json": + if err := json.NewEncoder(os.Stdout).Encode(s); err != nil { + return err + } + default: + return fmt.Errorf("invalid format option") + } + return nil + }, +} + +func getContainers(context *cli.Context) ([]containerState, error) { + factory, err := loadFactory(context) + if err != nil { + return nil, err + } + root := context.GlobalString("root") + absRoot, err := filepath.Abs(root) + if err != nil { + return nil, err + } + list, err := ioutil.ReadDir(absRoot) + if err != nil { + fatal(err) + } + + var s []containerState + for _, item := range list { + if item.IsDir() { + // This cast is safe on Linux. + stat := item.Sys().(*syscall.Stat_t) + owner, err := user.LookupUid(int(stat.Uid)) + if err != nil { + owner.Name = fmt.Sprintf("#%d", stat.Uid) + } + + container, err := factory.Load(item.Name()) + if err != nil { + fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err) + continue + } + containerStatus, err := container.Status() + if err != nil { + fmt.Fprintf(os.Stderr, "status for %s: %v\n", item.Name(), err) + continue + } + state, err := container.State() + if err != nil { + fmt.Fprintf(os.Stderr, "state for %s: %v\n", item.Name(), err) + continue + } + pid := state.BaseState.InitProcessPid + if containerStatus == libcontainer.Stopped { + pid = 0 + } + bundle, annotations := utils.Annotations(state.Config.Labels) + s = append(s, containerState{ + Version: state.BaseState.Config.Version, + ID: state.BaseState.ID, + InitProcessPid: pid, + Status: containerStatus.String(), + Bundle: bundle, + Rootfs: state.BaseState.Config.Rootfs, + Created: state.BaseState.Created, + Annotations: annotations, + Owner: owner.Name, + }) + } + } + return s, nil +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..3a8c163 --- /dev/null +++ b/main.go @@ -0,0 +1,176 @@ +package main + +import ( + "fmt" + "io" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/logs" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +// version will be populated by the Makefile, read from +// VERSION file of the source code. +var version = "" + +// gitCommit will be the hash that the binary was built from +// and will be populated by the Makefile +var gitCommit = "" + +const ( + specConfig = "config.json" + usage = `Open Container Initiative runtime + +runc is a command line client for running applications packaged according to +the Open Container Initiative (OCI) format and is a compliant implementation of the +Open Container Initiative specification. + +runc integrates well with existing process supervisors to provide a production +container runtime environment for applications. It can be used with your +existing process monitoring tools and the container will be spawned as a +direct child of the process supervisor. + +Containers are configured using bundles. A bundle for a container is a directory +that includes a specification file named "` + specConfig + `" and a root filesystem. +The root filesystem contains the contents of the container. + +To start a new instance of a container: + + # runc run [ -b bundle ] + +Where "" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. Providing the bundle directory using "-b" is optional. The default +value for "bundle" is the current directory.` +) + +func main() { + app := cli.NewApp() + app.Name = "runc" + app.Usage = usage + + var v []string + if version != "" { + v = append(v, version) + } + if gitCommit != "" { + v = append(v, fmt.Sprintf("commit: %s", gitCommit)) + } + v = append(v, fmt.Sprintf("spec: %s", specs.Version)) + app.Version = strings.Join(v, "\n") + + root := "/run/runc" + if shouldHonorXDGRuntimeDir() { + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + root = runtimeDir + "/runc" + // According to the XDG specification, we need to set anything in + // XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get + // auto-pruned. + if err := os.MkdirAll(root, 0700); err != nil { + fatal(err) + } + if err := os.Chmod(root, 0700|os.ModeSticky); err != nil { + fatal(err) + } + } + } + + app.Flags = []cli.Flag{ + cli.BoolFlag{ + Name: "debug", + Usage: "enable debug output for logging", + }, + cli.StringFlag{ + Name: "log", + Value: "", + Usage: "set the log file path where internal debug information is written", + }, + cli.StringFlag{ + Name: "log-format", + Value: "text", + Usage: "set the format used by logs ('text' (default), or 'json')", + }, + cli.StringFlag{ + Name: "root", + Value: root, + Usage: "root directory for storage of container state (this should be located in tmpfs)", + }, + cli.StringFlag{ + Name: "criu", + Value: "criu", + Usage: "path to the criu binary used for checkpoint and restore", + }, + cli.BoolFlag{ + Name: "systemd-cgroup", + Usage: "enable systemd cgroup support, expects cgroupsPath to be of form \"slice:prefix:name\" for e.g. \"system.slice:runc:434234\"", + }, + cli.StringFlag{ + Name: "rootless", + Value: "auto", + Usage: "ignore cgroup permission errors ('true', 'false', or 'auto')", + }, + } + app.Commands = []cli.Command{ + checkpointCommand, + createCommand, + deleteCommand, + eventsCommand, + execCommand, + initCommand, + killCommand, + listCommand, + pauseCommand, + psCommand, + restoreCommand, + resumeCommand, + runCommand, + specCommand, + startCommand, + stateCommand, + updateCommand, + } + app.Before = func(context *cli.Context) error { + return logs.ConfigureLogging(createLogConfig(context)) + } + + // If the command returns an error, cli takes upon itself to print + // the error on cli.ErrWriter and exit. + // Use our own writer here to ensure the log gets sent to the right location. + cli.ErrWriter = &FatalWriter{cli.ErrWriter} + if err := app.Run(os.Args); err != nil { + fatal(err) + } +} + +type FatalWriter struct { + cliErrWriter io.Writer +} + +func (f *FatalWriter) Write(p []byte) (n int, err error) { + logrus.Error(string(p)) + return f.cliErrWriter.Write(p) +} + +func createLogConfig(context *cli.Context) logs.Config { + logFilePath := context.GlobalString("log") + logPipeFd := "" + if logFilePath == "" { + logPipeFd = "2" + } + config := logs.Config{ + LogPipeFd: logPipeFd, + LogLevel: logrus.InfoLevel, + LogFilePath: logFilePath, + LogFormat: context.GlobalString("log-format"), + } + if context.GlobalBool("debug") { + config.LogLevel = logrus.DebugLevel + } + + return config +} diff --git a/man/README.md b/man/README.md new file mode 100644 index 0000000..1d7a54f --- /dev/null +++ b/man/README.md @@ -0,0 +1,11 @@ +runc man pages +==================== + +This directory contains man pages for runc in markdown format. + +To generate man pages from it, use this command + + ./md2man-all.sh + +You will see man pages generated under the man8 directory. + diff --git a/man/md2man-all.sh b/man/md2man-all.sh new file mode 100755 index 0000000..f850ddf --- /dev/null +++ b/man/md2man-all.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# get into this script's directory +cd "$(dirname "$(readlink -f "$BASH_SOURCE")")" + +[ "$1" = '-q' ] || { + set -x + pwd +} + +if ! ( which go-md2man &>/dev/null ); then + echo "To install man pages, please install 'go-md2man'." + exit 0 +fi + +for FILE in *.md; do + base="$(basename "$FILE")" + name="${base%.md}" + num="${name##*.}" + if [ -z "$num" -o "$name" = "$num" ]; then + # skip files that aren't of the format xxxx.N.md (like README.md) + continue + fi + mkdir -p "./man${num}" + go-md2man -in "$FILE" -out "./man${num}/${name}" +done diff --git a/man/runc-checkpoint.8.md b/man/runc-checkpoint.8.md new file mode 100644 index 0000000..08e6b1f --- /dev/null +++ b/man/runc-checkpoint.8.md @@ -0,0 +1,30 @@ +% runc-checkpoint "8" + +# NAME + runc checkpoint - checkpoint a running container + +# SYNOPSIS + runc checkpoint [command options] `` + +Where "``" is the name for the instance of the container to be +checkpointed. + +# DESCRIPTION + The checkpoint command saves the state of the container instance. + +# OPTIONS + --image-path value path for saving criu image files + --work-path value path for saving work files and logs + --parent-path value path for previous criu image files in pre-dump + --leave-running leave the process running after checkpointing + --tcp-established allow open tcp connections + --ext-unix-sk allow external unix sockets + --shell-job allow shell jobs + --lazy-pages use userfaultfd to lazily restore memory pages + --status-fd value criu writes \0 to this FD once lazy-pages is ready + --page-server value ADDRESS:PORT of the page server + --file-locks handle file locks, for safety + --pre-dump dump container's memory information only, leave the container running after this + --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' + --empty-ns value create a namespace, but don't restore its properties + --auto-dedup enable auto deduplication of memory images diff --git a/man/runc-create.8.md b/man/runc-create.8.md new file mode 100644 index 0000000..99c0a2c --- /dev/null +++ b/man/runc-create.8.md @@ -0,0 +1,29 @@ +% runc-create "8" + +# NAME + runc create - create a container + +# SYNOPSIS + runc create [command options] `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The create command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "config.json" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation. + +# OPTIONS + --bundle value, -b value path to the root of the bundle directory, defaults to the current directory + --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal + --pid-file value specify the file to write the process id to + --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk + --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key + --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/man/runc-delete.8.md b/man/runc-delete.8.md new file mode 100644 index 0000000..84922a0 --- /dev/null +++ b/man/runc-delete.8.md @@ -0,0 +1,19 @@ +% runc-delete "8" + +# NAME + runc delete - delete any resources held by the container often used with detached container + +# SYNOPSIS + runc delete [command options] `` + +Where "``" is the name for the instance of the container. + +# OPTIONS + --force, -f Forcibly deletes the container if it is still running (uses SIGKILL) + +# EXAMPLE +For example, if the container id is "ubuntu01" and runc list currently shows the +status of "ubuntu01" as "stopped" the following will delete resources held for +"ubuntu01" removing "ubuntu01" from the runc list of containers: + + # runc delete ubuntu01 diff --git a/man/runc-events.8.md b/man/runc-events.8.md new file mode 100644 index 0000000..d998a38 --- /dev/null +++ b/man/runc-events.8.md @@ -0,0 +1,17 @@ +% runc-events "8" + +# NAME + runc events - display container events such as OOM notifications, cpu, memory, and IO usage statistics + +# SYNOPSIS + runc events [command options] `` + +Where "``" is the name for the instance of the container. + +# DESCRIPTION + The events command displays information about the container. By default the +information is displayed once every 5 seconds. + +# OPTIONS + --interval value set the stats collection interval (default: 5s) + --stats display the container's stats then exit diff --git a/man/runc-exec.8.md b/man/runc-exec.8.md new file mode 100644 index 0000000..dbaaefe --- /dev/null +++ b/man/runc-exec.8.md @@ -0,0 +1,33 @@ +% runc-exec "8" + +# NAME + runc exec - execute new process inside the container + +# SYNOPSIS + runc exec [command options] `` -- `` [args...] + +Where "``" is the name for the instance of the container and +"``" is the command to be executed in the container. + +# EXAMPLE +For example, if the container is configured to run the linux ps command the +following will output a list of processes running in the container: + + # runc exec ps + +# OPTIONS + --console value specify the pty slave path for use with the container + --cwd value current working directory in the container + --env value, -e value set environment variables + --tty, -t allocate a pseudo-TTY + --user value, -u value UID (format: [:]) + --additional-gids value, -g value additional gids + --process value, -p value path to the process.json + --detach, -d detach from the container's process + --pid-file value specify the file to write the process id to + --process-label value set the asm process label for the process commonly used with selinux + --apparmor value set the apparmor profile for the process + --no-new-privs set the no new privileges value for the process + --cap value, -c value add a capability to the bounding set for the process + --no-subreaper disable the use of the subreaper used to reap reparented processes + --preserve-fds value pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md new file mode 100644 index 0000000..1ea579a --- /dev/null +++ b/man/runc-kill.8.md @@ -0,0 +1,20 @@ +% runc-kill "8" + +# NAME + runc kill - kill sends the specified signal (default: SIGTERM) to the container's init process + +# SYNOPSIS + runc kill [command options] `` `` + +Where "``" is the name for the instance of the container and +"``" is the signal to be sent to the init process. + +# OPTIONS + --all, -a send the specified signal to all processes inside the container + +# EXAMPLE + +For example, if the container id is "ubuntu01" the following will send a "KILL" +signal to the init process of the "ubuntu01" container: + + # runc kill ubuntu01 KILL diff --git a/man/runc-list.8.md b/man/runc-list.8.md new file mode 100644 index 0000000..46cd5d0 --- /dev/null +++ b/man/runc-list.8.md @@ -0,0 +1,21 @@ +% runc-list "8" + +# NAME + runc list - lists containers started by runc with the given root + +# SYNOPSIS + runc list [command options] + +# EXAMPLE +Where the given root is specified via the global option "--root" +(default: "/run/runc"). + +To list containers created via the default "--root": + # runc list + +To list containers created using a non-default value for "--root": + # runc --root value list + +# OPTIONS + --format value, -f value select one of: table or json (default: "table") + --quiet, -q display only container IDs diff --git a/man/runc-pause.8.md b/man/runc-pause.8.md new file mode 100644 index 0000000..965f7da --- /dev/null +++ b/man/runc-pause.8.md @@ -0,0 +1,14 @@ +% runc-pause "8" + +# NAME + runc pause - pause suspends all processes inside the container + +# SYNOPSIS + runc pause `` + +Where "``" is the name for the instance of the container to be +paused. + +# DESCRIPTION + The pause command suspends all processes in the instance of the container. +Use runc list to identify instances of containers and their current status. diff --git a/man/runc-ps.8.md b/man/runc-ps.8.md new file mode 100644 index 0000000..1fad467 --- /dev/null +++ b/man/runc-ps.8.md @@ -0,0 +1,15 @@ +% runc-ps "8" + +# NAME + runc ps - ps displays the processes running inside a container + +# SYNOPSIS + runc ps [command options] `` [ps options] + +# OPTIONS + --format value, -f value select one of: table(default) or json + +The default format is table. The following will output the processes of a container +in json format: + + # runc ps -f json diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md new file mode 100644 index 0000000..e475bd5 --- /dev/null +++ b/man/runc-restore.8.md @@ -0,0 +1,28 @@ +% runc-restore "8" + +# NAME + runc restore - restore a container from a previous checkpoint + +# SYNOPSIS + runc restore [command options] `` + +Where "``" is the name for the instance of the container to be +restored. + +# DESCRIPTION + Restores the saved state of the container instance that was previously saved +using the runc checkpoint command. + +# OPTIONS + --image-path value path to criu image files for restoring + --work-path value path for saving work files and logs + --tcp-established allow open tcp connections + --ext-unix-sk allow external unix sockets + --shell-job allow shell jobs + --file-locks handle file locks, for safety + --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' + --bundle value, -b value path to the root of the bundle directory + --detach, -d detach from the container's process + --pid-file value specify the file to write the process id to + --no-subreaper disable the use of the subreaper used to reap reparented processes + --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk diff --git a/man/runc-resume.8.md b/man/runc-resume.8.md new file mode 100644 index 0000000..25d342f --- /dev/null +++ b/man/runc-resume.8.md @@ -0,0 +1,14 @@ +% runc-resume "8" + +# NAME + runc resume - resumes all processes that have been previously paused + +# SYNOPSIS + runc resume `` + +Where "``" is the name for the instance of the container to be +resumed. + +# DESCRIPTION + The resume command resumes all processes in the instance of the container. +Use runc list to identify instances of containers and their current status. diff --git a/man/runc-run.8.md b/man/runc-run.8.md new file mode 100644 index 0000000..ad2b8b2 --- /dev/null +++ b/man/runc-run.8.md @@ -0,0 +1,31 @@ +% runc-run "8" + +# NAME + runc run - create and run a container + +# SYNOPSIS + runc run [command options] `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The run command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "config.json" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation. + +# OPTIONS + --bundle value, -b value path to the root of the bundle directory, defaults to the current directory + --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal + --detach, -d detach from the container's process + --pid-file value specify the file to write the process id to + --no-subreaper disable the use of the subreaper used to reap reparented processes + --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk + --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key + --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/man/runc-spec.8.md b/man/runc-spec.8.md new file mode 100644 index 0000000..6a181cd --- /dev/null +++ b/man/runc-spec.8.md @@ -0,0 +1,56 @@ +% runc-spec "8" + +# NAME + runc spec - create a new specification file + +# SYNOPSIS + runc spec [command options] [arguments...] + +# DESCRIPTION + The spec command creates the new specification file named "config.json" for +the bundle. + +The spec generated is just a starter file. Editing of the spec is required to +achieve desired results. For example, the newly generated spec includes an args +parameter that is initially set to call the "sh" command when the container is +started. Calling "sh" may work for an ubuntu container or busybox, but will not +work for containers that do not include the "sh" program. + +# EXAMPLE + To run docker's hello-world container one needs to set the args parameter +in the spec to call hello. This can be done using the sed command or a text +editor. The following commands create a bundle for hello-world, change the +default args parameter in the spec from "sh" to "/hello", then run the hello +command in a new hello-world container named container1: + + mkdir hello + cd hello + docker pull hello-world + docker export $(docker create hello-world) > hello-world.tar + mkdir rootfs + tar -C rootfs -xf hello-world.tar + runc spec + sed -i 's;"sh";"/hello";' config.json + runc start container1 + +In the start command above, "container1" is the name for the instance of the +container that you are starting. The name you provide for the container instance +must be unique on your host. + +An alternative for generating a customized spec config is to use "oci-runtime-tool", the +sub-command "oci-runtime-tool generate" has lots of options that can be used to do any +customizations as you want, see [runtime-tools](https://github.com/opencontainers/runtime-tools) +to get more information. + +When starting a container through runc, runc needs root privilege. If not +already running as root, you can use sudo to give runc root privilege. For +example: "sudo runc start container1" will give runc root privilege to start the +container on your host. + +Alternatively, you can start a rootless container, which has the ability to run without root privileges. +For this to work, the specification file needs to be adjusted accordingly. +You can pass the parameter **--rootless** to this command to generate a proper rootless spec file. + +# OPTIONS + --bundle value, -b value path to the root of the bundle directory + --rootless generate a configuration for a rootless container diff --git a/man/runc-start.8.md b/man/runc-start.8.md new file mode 100644 index 0000000..e4bbacc --- /dev/null +++ b/man/runc-start.8.md @@ -0,0 +1,14 @@ +% runc-start "8" + +# NAME + runc start - start executes the user defined process in a created container + +# SYNOPSIS + runc start `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The start command executes the user defined process in a created container. diff --git a/man/runc-state.8.md b/man/runc-state.8.md new file mode 100644 index 0000000..768f79f --- /dev/null +++ b/man/runc-state.8.md @@ -0,0 +1,13 @@ +% runc-state "8" + +# NAME + runc state - output the state of a container + +# SYNOPSIS + runc state `` + +Where "``" is your name for the instance of the container. + +# DESCRIPTION + The state command outputs current state information for the +instance of a container. diff --git a/man/runc-update.8.md b/man/runc-update.8.md new file mode 100644 index 0000000..fa269d6 --- /dev/null +++ b/man/runc-update.8.md @@ -0,0 +1,55 @@ +% runc-update "8" + +# NAME + runc update - update container resource constraints + +# SYNOPSIS + runc update [command options] `` + +# DESCRIPTION + The data can be read from a file or the standard input, the +accepted format is as follow (unchanged values can be omitted): + + { + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0, + "kernel": 0, + "kernelTCP": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "blkioWeight": 0 + } + } + +Note: if data is to be read from a file or the standard input, all +other options are ignored. + +# OPTIONS + --resources value, -r value path to the file containing the resources to update or '-' to read from the standard input + --blkio-weight value Specifies per cgroup weight, range is from 10 to 1000 (default: 0) + --cpu-period value CPU CFS period to be used for hardcapping (in usecs). 0 to use system default + --cpu-quota value CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period + --cpu-rt-period value CPU realtime period to be used for hardcapping (in usecs). 0 to use system default + --cpu-rt-runtime value CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period + --cpu-share value CPU shares (relative weight vs. other containers) + --cpuset-cpus value CPU(s) to use + --cpuset-mems value Memory node(s) to use + --kernel-memory value Kernel memory limit (in bytes) + --kernel-memory-tcp value Kernel memory limit (in bytes) for tcp buffer + --memory value Memory limit (in bytes) + --memory-reservation value Memory reservation or soft_limit (in bytes) + --memory-swap value Total memory usage (memory + swap); set '-1' to enable unlimited swap + --pids-limit value Maximum number of pids allowed in the container (default: 0) + --l3-cache-schema The string of Intel RDT/CAT L3 cache schema + --mem-bw-schema The string of Intel RDT/MBA memory bandwidth schema diff --git a/man/runc.8.md b/man/runc.8.md new file mode 100644 index 0000000..49df525 --- /dev/null +++ b/man/runc.8.md @@ -0,0 +1,61 @@ +% runc "8" + +# NAME + runc - Open Container Initiative runtime + +# SYNOPSIS + runc [global options] command [command options] [arguments...] + +# DESCRIPTION +runc is a command line client for running applications packaged according to +the Open Container Initiative (OCI) format and is a compliant implementation of the +Open Container Initiative specification. + +runc integrates well with existing process supervisors to provide a production +container runtime environment for applications. It can be used with your +existing process monitoring tools and the container will be spawned as a +direct child of the process supervisor. + +Containers are configured using bundles. A bundle for a container is a directory +that includes a specification file named "config.json" and a root filesystem. +The root filesystem contains the contents of the container. + +To start a new instance of a container: + + # runc start [ -b bundle ] + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. Providing the bundle directory using "-b" is optional. The default +value for "bundle" is the current directory. + +# COMMANDS + checkpoint checkpoint a running container + create create a container + delete delete any resources held by the container often used with detached containers + events display container events such as OOM notifications, cpu, memory, IO and network stats + exec execute new process inside the container + init initialize the namespaces and launch the process (do not call it outside of runc) + kill kill sends the specified signal (default: SIGTERM) to the container's init process + list lists containers started by runc with the given root + pause pause suspends all processes inside the container + ps displays the processes running inside a container + restore restore a container from a previous checkpoint + resume resumes all processes that have been previously paused + run create and run a container + spec create a new specification file + start executes the user defined process in a created container + state output the state of a container + update update container resource constraints + help, h Shows a list of commands or help for one command + +# GLOBAL OPTIONS + --debug enable debug output for logging + --log value set the log file path where internal debug information is written (default: "/dev/null") + --log-format value set the format used by logs ('text' (default), or 'json') (default: "text") + --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers) + --criu value path to the criu binary used for checkpoint and restore (default: "criu") + --systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234" + --rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto") + --help, -h show help + --version, -v print the version diff --git a/notify_socket.go b/notify_socket.go new file mode 100644 index 0000000..e7453c6 --- /dev/null +++ b/notify_socket.go @@ -0,0 +1,116 @@ +// +build linux + +package main + +import ( + "bytes" + "fmt" + "net" + "os" + "path/filepath" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +type notifySocket struct { + socket *net.UnixConn + host string + socketPath string +} + +func newNotifySocket(context *cli.Context, notifySocketHost string, id string) *notifySocket { + if notifySocketHost == "" { + return nil + } + + root := filepath.Join(context.GlobalString("root"), id) + path := filepath.Join(root, "notify.sock") + + notifySocket := ¬ifySocket{ + socket: nil, + host: notifySocketHost, + socketPath: path, + } + + return notifySocket +} + +func (s *notifySocket) Close() error { + return s.socket.Close() +} + +// If systemd is supporting sd_notify protocol, this function will add support +// for sd_notify protocol from within the container. +func (s *notifySocket) setupSpec(context *cli.Context, spec *specs.Spec) { + mount := specs.Mount{Destination: s.host, Source: s.socketPath, Options: []string{"bind"}} + spec.Mounts = append(spec.Mounts, mount) + spec.Process.Env = append(spec.Process.Env, fmt.Sprintf("NOTIFY_SOCKET=%s", s.host)) +} + +func (s *notifySocket) setupSocket() error { + addr := net.UnixAddr{ + Name: s.socketPath, + Net: "unixgram", + } + + socket, err := net.ListenUnixgram("unixgram", &addr) + if err != nil { + return err + } + + err = os.Chmod(s.socketPath, 0777) + if err != nil { + socket.Close() + return err + } + + s.socket = socket + return nil +} + +// pid1 must be set only with -d, as it is used to set the new process as the main process +// for the service in systemd +func (s *notifySocket) run(pid1 int) { + buf := make([]byte, 512) + notifySocketHostAddr := net.UnixAddr{Name: s.host, Net: "unixgram"} + client, err := net.DialUnix("unixgram", nil, ¬ifySocketHostAddr) + if err != nil { + logrus.Error(err) + return + } + for { + r, err := s.socket.Read(buf) + if err != nil { + break + } + var out bytes.Buffer + for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) { + if bytes.HasPrefix(line, []byte("READY=")) { + _, err = out.Write(line) + if err != nil { + return + } + + _, err = out.Write([]byte{'\n'}) + if err != nil { + return + } + + _, err = client.Write(out.Bytes()) + if err != nil { + return + } + + // now we can inform systemd to use pid1 as the pid to monitor + if pid1 > 0 { + newPid := fmt.Sprintf("MAINPID=%d\n", pid1) + client.Write([]byte(newPid)) + } + return + } + } + } +} diff --git a/pause.go b/pause.go new file mode 100644 index 0000000..224c79f --- /dev/null +++ b/pause.go @@ -0,0 +1,66 @@ +// +build linux + +package main + +import ( + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var pauseCommand = cli.Command{ + Name: "pause", + Usage: "pause suspends all processes inside the container", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +paused. `, + Description: `The pause command suspends all processes in the instance of the container. + +Use runc list to identify instances of containers and their current status.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return err + } + if rootlessCg { + logrus.Warnf("runc pause may fail if you don't have the full access to cgroups") + } + container, err := getContainer(context) + if err != nil { + return err + } + return container.Pause() + }, +} + +var resumeCommand = cli.Command{ + Name: "resume", + Usage: "resumes all processes that have been previously paused", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +resumed.`, + Description: `The resume command resumes all processes in the instance of the container. + +Use runc list to identify instances of containers and their current status.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return err + } + if rootlessCg { + logrus.Warn("runc resume may fail if you don't have the full access to cgroups") + } + container, err := getContainer(context) + if err != nil { + return err + } + return container.Resume() + }, +} diff --git a/ps.go b/ps.go new file mode 100644 index 0000000..e7f635f --- /dev/null +++ b/ps.go @@ -0,0 +1,113 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "strconv" + "strings" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var psCommand = cli.Command{ + Name: "ps", + Usage: "ps displays the processes running inside a container", + ArgsUsage: ` [ps options]`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "format, f", + Value: "table", + Usage: `select one of: ` + formatOptions, + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return err + } + if rootlessCg { + logrus.Warn("runc ps may fail if you don't have the full access to cgroups") + } + + container, err := getContainer(context) + if err != nil { + return err + } + + pids, err := container.Processes() + if err != nil { + return err + } + + switch context.String("format") { + case "table": + case "json": + return json.NewEncoder(os.Stdout).Encode(pids) + default: + return fmt.Errorf("invalid format option") + } + + // [1:] is to remove command name, ex: + // context.Args(): [containet_id ps_arg1 ps_arg2 ...] + // psArgs: [ps_arg1 ps_arg2 ...] + // + psArgs := context.Args()[1:] + if len(psArgs) == 0 { + psArgs = []string{"-ef"} + } + + cmd := exec.Command("ps", psArgs...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%s: %s", err, output) + } + + lines := strings.Split(string(output), "\n") + pidIndex, err := getPidIndex(lines[0]) + if err != nil { + return err + } + + fmt.Println(lines[0]) + for _, line := range lines[1:] { + if len(line) == 0 { + continue + } + fields := strings.Fields(line) + p, err := strconv.Atoi(fields[pidIndex]) + if err != nil { + return fmt.Errorf("unexpected pid '%s': %s", fields[pidIndex], err) + } + + for _, pid := range pids { + if pid == p { + fmt.Println(line) + break + } + } + } + return nil + }, + SkipArgReorder: true, +} + +func getPidIndex(title string) (int, error) { + titles := strings.Fields(title) + + pidIndex := -1 + for i, name := range titles { + if name == "PID" { + return i, nil + } + } + + return pidIndex, fmt.Errorf("couldn't find PID field in ps output") +} diff --git a/restore.go b/restore.go new file mode 100644 index 0000000..53f50d2 --- /dev/null +++ b/restore.go @@ -0,0 +1,142 @@ +// +build linux + +package main + +import ( + "os" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var restoreCommand = cli.Command{ + Name: "restore", + Usage: "restore a container from a previous checkpoint", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +restored.`, + Description: `Restores the saved state of the container instance that was previously saved +using the runc checkpoint command.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "image-path", + Value: "", + Usage: "path to criu image files for restoring", + }, + cli.StringFlag{ + Name: "work-path", + Value: "", + Usage: "path for saving work files and logs", + }, + cli.BoolFlag{ + Name: "tcp-established", + Usage: "allow open tcp connections", + }, + cli.BoolFlag{ + Name: "ext-unix-sk", + Usage: "allow external unix sockets", + }, + cli.BoolFlag{ + Name: "shell-job", + Usage: "allow shell jobs", + }, + cli.BoolFlag{ + Name: "file-locks", + Usage: "handle file locks, for safety", + }, + cli.StringFlag{ + Name: "manage-cgroups-mode", + Value: "", + Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'", + }, + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: "path to the root of the bundle directory", + }, + cli.BoolFlag{ + Name: "detach,d", + Usage: "detach from the container's process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.StringSliceFlag{ + Name: "empty-ns", + Usage: "create a namespace, but don't restore its properties", + }, + cli.BoolFlag{ + Name: "auto-dedup", + Usage: "enable auto deduplication of memory images", + }, + cli.BoolFlag{ + Name: "lazy-pages", + Usage: "use userfaultfd to lazily restore memory pages", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + // XXX: Currently this is untested with rootless containers. + if os.Geteuid() != 0 || system.RunningInUserNS() { + logrus.Warn("runc checkpoint is untested with rootless containers") + } + + spec, err := setupSpec(context) + if err != nil { + return err + } + options := criuOptions(context) + if err := setEmptyNsMask(context, options); err != nil { + return err + } + status, err := startContainer(context, spec, CT_ACT_RESTORE, options) + if err != nil { + return err + } + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. + os.Exit(status) + return nil + }, +} + +func criuOptions(context *cli.Context) *libcontainer.CriuOpts { + imagePath := getCheckpointImagePath(context) + if err := os.MkdirAll(imagePath, 0755); err != nil { + fatal(err) + } + return &libcontainer.CriuOpts{ + ImagesDirectory: imagePath, + WorkDirectory: context.String("work-path"), + ParentImage: context.String("parent-path"), + LeaveRunning: context.Bool("leave-running"), + TcpEstablished: context.Bool("tcp-established"), + ExternalUnixConnections: context.Bool("ext-unix-sk"), + ShellJob: context.Bool("shell-job"), + FileLocks: context.Bool("file-locks"), + PreDump: context.Bool("pre-dump"), + AutoDedup: context.Bool("auto-dedup"), + LazyPages: context.Bool("lazy-pages"), + StatusFd: context.String("status-fd"), + } +} diff --git a/rlimit_linux.go b/rlimit_linux.go new file mode 100644 index 0000000..c97a0fb --- /dev/null +++ b/rlimit_linux.go @@ -0,0 +1,49 @@ +package main + +import "fmt" + +const ( + RLIMIT_CPU = iota // CPU time in sec + RLIMIT_FSIZE // Maximum filesize + RLIMIT_DATA // max data size + RLIMIT_STACK // max stack size + RLIMIT_CORE // max core file size + RLIMIT_RSS // max resident set size + RLIMIT_NPROC // max number of processes + RLIMIT_NOFILE // max number of open files + RLIMIT_MEMLOCK // max locked-in-memory address space + RLIMIT_AS // address space limit + RLIMIT_LOCKS // maximum file locks held + RLIMIT_SIGPENDING // max number of pending signals + RLIMIT_MSGQUEUE // maximum bytes in POSIX mqueues + RLIMIT_NICE // max nice prio allowed to raise to + RLIMIT_RTPRIO // maximum realtime priority + RLIMIT_RTTIME // timeout for RT tasks in us +) + +var rlimitMap = map[string]int{ + "RLIMIT_CPU": RLIMIT_CPU, + "RLIMIT_FSIZE": RLIMIT_FSIZE, + "RLIMIT_DATA": RLIMIT_DATA, + "RLIMIT_STACK": RLIMIT_STACK, + "RLIMIT_CORE": RLIMIT_CORE, + "RLIMIT_RSS": RLIMIT_RSS, + "RLIMIT_NPROC": RLIMIT_NPROC, + "RLIMIT_NOFILE": RLIMIT_NOFILE, + "RLIMIT_MEMLOCK": RLIMIT_MEMLOCK, + "RLIMIT_AS": RLIMIT_AS, + "RLIMIT_LOCKS": RLIMIT_LOCKS, + "RLIMIT_SIGPENDING": RLIMIT_SIGPENDING, + "RLIMIT_MSGQUEUE": RLIMIT_MSGQUEUE, + "RLIMIT_NICE": RLIMIT_NICE, + "RLIMIT_RTPRIO": RLIMIT_RTPRIO, + "RLIMIT_RTTIME": RLIMIT_RTTIME, +} + +func strToRlimit(key string) (int, error) { + rl, ok := rlimitMap[key] + if !ok { + return 0, fmt.Errorf("wrong rlimit value: %s", key) + } + return rl, nil +} diff --git a/rootless_linux.go b/rootless_linux.go new file mode 100644 index 0000000..3c425dc --- /dev/null +++ b/rootless_linux.go @@ -0,0 +1,58 @@ +// +build linux + +package main + +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/system" + "github.com/urfave/cli" +) + +func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) { + if context != nil { + b, err := parseBoolOrAuto(context.GlobalString("rootless")) + if err != nil { + return false, err + } + // nil b stands for "auto detect" + if b != nil { + return *b, nil + } + + if context.GlobalBool("systemd-cgroup") { + return false, nil + } + } + if os.Geteuid() != 0 { + return true, nil + } + if !system.RunningInUserNS() { + // euid == 0 , in the initial ns (i.e. the real root) + return false, nil + } + // euid = 0, in a userns. + // As we are unaware of cgroups path, we can't determine whether we have the full + // access to the cgroups path. + // Either way, we can safely decide to use the rootless cgroups manager. + return true, nil +} + +func shouldHonorXDGRuntimeDir() bool { + if os.Getenv("XDG_RUNTIME_DIR") == "" { + return false + } + if os.Geteuid() != 0 { + return true + } + if !system.RunningInUserNS() { + // euid == 0 , in the initial ns (i.e. the real root) + // in this case, we should use /run/runc and ignore + // $XDG_RUNTIME_DIR (e.g. /run/user/0) for backward + // compatibility. + return false + } + // euid = 0, in a userns. + u, ok := os.LookupEnv("USER") + return !ok || u != "root" +} diff --git a/run.go b/run.go new file mode 100644 index 0000000..f8d6317 --- /dev/null +++ b/run.go @@ -0,0 +1,84 @@ +// +build linux + +package main + +import ( + "os" + + "github.com/urfave/cli" +) + +// default action is to start a container +var runCommand = cli.Command{ + Name: "run", + Usage: "create and run a container", + ArgsUsage: ` + +Where "" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The run command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "` + specConfig + `" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: `path to the root of the bundle directory, defaults to the current directory`, + }, + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.BoolFlag{ + Name: "detach, d", + Usage: "detach from the container's process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.BoolFlag{ + Name: "no-new-keyring", + Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key", + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + if err := revisePidFile(context); err != nil { + return err + } + spec, err := setupSpec(context) + if err != nil { + return err + } + status, err := startContainer(context, spec, CT_ACT_RUN, nil) + if err == nil { + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. + os.Exit(status) + } + return err + }, +} diff --git a/script/.validate b/script/.validate new file mode 100644 index 0000000..170d674 --- /dev/null +++ b/script/.validate @@ -0,0 +1,33 @@ +#!/bin/bash + +if [ -z "$VALIDATE_UPSTREAM" ]; then + # this is kind of an expensive check, so let's not do this twice if we + # are running more than one validate bundlescript + + VALIDATE_REPO='https://github.com/opencontainers/runc.git' + VALIDATE_BRANCH='master' + + if [ "$TRAVIS" = 'true' -a "$TRAVIS_PULL_REQUEST" != 'false' ]; then + VALIDATE_REPO="https://github.com/${TRAVIS_REPO_SLUG}.git" + VALIDATE_BRANCH="${TRAVIS_BRANCH}" + fi + + VALIDATE_HEAD="$(git rev-parse --verify HEAD)" + + git fetch -q "$VALIDATE_REPO" "refs/heads/$VALIDATE_BRANCH" + VALIDATE_UPSTREAM="$(git rev-parse --verify FETCH_HEAD)" + + VALIDATE_COMMIT_LOG="$VALIDATE_UPSTREAM..$VALIDATE_HEAD" + VALIDATE_COMMIT_DIFF="$VALIDATE_UPSTREAM...$VALIDATE_HEAD" + + validate_diff() { + if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then + git diff "$VALIDATE_COMMIT_DIFF" "$@" + fi + } + validate_log() { + if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then + git log "$VALIDATE_COMMIT_LOG" "$@" + fi + } +fi diff --git a/script/check-config.sh b/script/check-config.sh new file mode 100755 index 0000000..6b8158e --- /dev/null +++ b/script/check-config.sh @@ -0,0 +1,253 @@ +#!/usr/bin/env bash +set -e + +# bits of this were adapted from check_config.sh in docker +# see also https://github.com/docker/docker/blob/master/contrib/check-config.sh + +possibleConfigs=( + '/proc/config.gz' + "/boot/config-$(uname -r)" + "/usr/src/linux-$(uname -r)/.config" + '/usr/src/linux/.config' +) +possibleConfigFiles=( + 'config.gz' + "config-$(uname -r)" + '.config' +) + +if ! command -v zgrep &>/dev/null; then + zgrep() { + zcat "$2" | grep "$1" + } +fi + +kernelVersion="$(uname -r)" +kernelMajor="${kernelVersion%%.*}" +kernelMinor="${kernelVersion#$kernelMajor.}" +kernelMinor="${kernelMinor%%.*}" + +is_set() { + zgrep "CONFIG_$1=[y|m]" "$CONFIG" >/dev/null +} +is_set_in_kernel() { + zgrep "CONFIG_$1=y" "$CONFIG" >/dev/null +} +is_set_as_module() { + zgrep "CONFIG_$1=m" "$CONFIG" >/dev/null +} + +color() { + local codes=() + if [ "$1" = 'bold' ]; then + codes=("${codes[@]}" '1') + shift + fi + if [ "$#" -gt 0 ]; then + local code + case "$1" in + # see https://en.wikipedia.org/wiki/ANSI_escape_code#Colors + black) code=30 ;; + red) code=31 ;; + green) code=32 ;; + yellow) code=33 ;; + blue) code=34 ;; + magenta) code=35 ;; + cyan) code=36 ;; + white) code=37 ;; + esac + if [ "$code" ]; then + codes=("${codes[@]}" "$code") + fi + fi + local IFS=';' + echo -en '\033['"${codes[*]}"'m' +} +wrap_color() { + text="$1" + shift + color "$@" + echo -n "$text" + color reset + echo +} + +wrap_good() { + echo "$(wrap_color "$1" white): $(wrap_color "$2" green)" +} +wrap_bad() { + echo "$(wrap_color "$1" bold): $(wrap_color "$2" bold red)" +} +wrap_warning() { + wrap_color >&2 "$*" red +} + +check_flag() { + if is_set_in_kernel "$1"; then + wrap_good "CONFIG_$1" 'enabled' + elif is_set_as_module "$1"; then + wrap_good "CONFIG_$1" 'enabled (as module)' + else + wrap_bad "CONFIG_$1" 'missing' + fi +} + +check_flags() { + for flag in "$@"; do + echo "- $(check_flag "$flag")" + done +} + +check_distro_userns() { + source /etc/os-release 2>/dev/null || /bin/true + if [[ "${ID}" =~ ^(centos|rhel)$ && "${VERSION_ID}" =~ ^7 ]]; then + # this is a CentOS7 or RHEL7 system + grep -q "user_namespace.enable=1" /proc/cmdline || { + # no user namespace support enabled + wrap_bad " (RHEL7/CentOS7" "User namespaces disabled; add 'user_namespace.enable=1' to boot command line)" + } + fi +} + +is_config() { + local config="$1" + + # Todo: more check + [[ -f "$config" ]] && return 0 + return 1 +} + +search_config() { + local target_dir="$1" + [[ "$target_dir" ]] || target_dir=("${possibleConfigs[@]}") + + local tryConfig + for tryConfig in "${target_dir[@]}"; do + is_config "$tryConfig" && { + CONFIG="$tryConfig" + return + } + [[ -d "$tryConfig" ]] && { + for tryFile in "${possibleConfigFiles[@]}"; do + is_config "$tryConfig/$tryFile" && { + CONFIG="$tryConfig/$tryFile" + return + } + done + } + done + + wrap_warning "error: cannot find kernel config" + wrap_warning " try running this script again, specifying the kernel config:" + wrap_warning " CONFIG=/path/to/kernel/.config $0 or $0 /path/to/kernel/.config" + exit 1 +} + +CONFIG="$1" + +is_config "$CONFIG" || { + if [[ ! "$CONFIG" ]]; then + wrap_color "info: no config specified, searching for kernel config ..." white + search_config + elif [[ -d "$CONFIG" ]]; then + wrap_color "info: input is a directory, searching for kernel config in this directory..." white + search_config "$CONFIG" + else + wrap_warning "warning: $CONFIG seems not a kernel config, searching other paths for kernel config ..." + search_config + fi +} + +wrap_color "info: reading kernel config from $CONFIG ..." white +echo + +echo 'Generally Necessary:' + +echo -n '- ' +cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)" +cgroupDir="$(dirname "$cgroupSubsystemDir")" +if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" -o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then + echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]" +else + if [ "$cgroupSubsystemDir" ]; then + echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]" + else + echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')" + fi + echo " $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)" +fi + +if [ "$(cat /sys/module/apparmor/parameters/enabled 2>/dev/null)" = 'Y' ]; then + echo -n '- ' + if command -v apparmor_parser &>/dev/null; then + echo "$(wrap_good 'apparmor' 'enabled and tools installed')" + else + echo "$(wrap_bad 'apparmor' 'enabled, but apparmor_parser missing')" + echo -n ' ' + if command -v apt-get &>/dev/null; then + echo "$(wrap_color '(use "apt-get install apparmor" to fix this)')" + elif command -v yum &>/dev/null; then + echo "$(wrap_color '(your best bet is "yum install apparmor-parser")')" + else + echo "$(wrap_color '(look for an "apparmor" package for your distribution)')" + fi + fi +fi + +flags=( + NAMESPACES {NET,PID,IPC,UTS}_NS + CGROUPS CGROUP_CPUACCT CGROUP_DEVICE CGROUP_FREEZER CGROUP_SCHED CPUSETS MEMCG + KEYS + VETH BRIDGE BRIDGE_NETFILTER + NF_NAT_IPV4 IP_NF_FILTER IP_NF_TARGET_MASQUERADE + NETFILTER_XT_MATCH_{ADDRTYPE,CONNTRACK,IPVS} + IP_NF_NAT NF_NAT NF_NAT_NEEDED + + # required for bind-mounting /dev/mqueue into containers + POSIX_MQUEUE +) +check_flags "${flags[@]}" +echo + +echo 'Optional Features:' +{ + check_flags USER_NS + check_distro_userns + + check_flags SECCOMP + check_flags CGROUP_PIDS + + check_flags MEMCG_SWAP MEMCG_SWAP_ENABLED + if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then + echo " $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)" + fi +} + +if [ "$kernelMajor" -lt 4 ] || [ "$kernelMajor" -eq 4 -a "$kernelMinor" -le 5 ]; then + check_flags MEMCG_KMEM +fi + +if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 18 ]; then + check_flags RESOURCE_COUNTERS +fi + +if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 13 ]; then + netprio=NETPRIO_CGROUP +else + netprio=CGROUP_NET_PRIO +fi + +flags=( + BLK_CGROUP BLK_DEV_THROTTLING IOSCHED_CFQ CFQ_GROUP_IOSCHED + CGROUP_PERF + CGROUP_HUGETLB + NET_CLS_CGROUP $netprio + CFS_BANDWIDTH FAIR_GROUP_SCHED RT_GROUP_SCHED + IP_NF_TARGET_REDIRECT + IP_VS + IP_VS_NFCT + IP_VS_PROTO_TCP + IP_VS_PROTO_UDP + IP_VS_RR +) +check_flags "${flags[@]}" diff --git a/script/release.sh b/script/release.sh new file mode 100755 index 0000000..a1ebc95 --- /dev/null +++ b/script/release.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright (C) 2017 SUSE LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +## ---> +# Project-specific options and functions. In *theory* you shouldn't need to +# touch anything else in this script in order to use this elsewhere. +project="runc" +root="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")" + +# This function takes an output path as an argument, where the built +# (preferably static) binary should be placed. +function build_project() { + builddir="$(dirname "$1")" + + # Build with all tags enabled. + make -C "$root" COMMIT_NO= BUILDTAGS="seccomp selinux apparmor" static + mv "$root/$project" "$1" +} + +# End of the easy-to-configure portion. +## <--- + +# Print usage information. +function usage() { + echo "usage: release.sh [-S ] [-c ] [-r ] [-v ]" >&2 + exit 1 +} + +# Log something to stderr. +function log() { + echo "[*] $*" >&2 +} + +# Log something to stderr and then exit with 0. +function bail() { + log "$@" + exit 0 +} + +# Conduct a sanity-check to make sure that GPG provided with the given +# arguments can sign something. Inability to sign things is not a fatal error. +function gpg_cansign() { + gpg "$@" --clear-sign /dev/null +} + +# When creating releases we need to build static binaries, an archive of the +# current commit, and generate detached signatures for both. +keyid="" +commit="HEAD" +version="" +releasedir="" +hashcmd="" +while getopts "S:c:r:v:h:" opt; do + case "$opt" in + S) + keyid="$OPTARG" + ;; + c) + commit="$OPTARG" + ;; + r) + releasedir="$OPTARG" + ;; + v) + version="$OPTARG" + ;; + h) + hashcmd="$OPTARG" + ;; + \:) + echo "Missing argument: -$OPTARG" >&2 + usage + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + esac +done + +version="${version:-$(<"$root/VERSION")}" +releasedir="${releasedir:-release/$version}" +hashcmd="${hashcmd:-sha256sum}" +goarch="$(go env GOARCH || echo "amd64")" + +log "creating $project release in '$releasedir'" +log " version: $version" +log " commit: $commit" +log " key: ${keyid:-DEFAULT}" +log " hash: $hashcmd" + +# Make explicit what we're doing. +set -x + +# Make the release directory. +rm -rf "$releasedir" && mkdir -p "$releasedir" + +# Build project. +build_project "$releasedir/$project.$goarch" + +# Generate new archive. +git archive --format=tar --prefix="$project-$version/" "$commit" | xz > "$releasedir/$project.tar.xz" + +# Generate sha256 checksums for both. +( cd "$releasedir" ; "$hashcmd" "$project".{"$goarch",tar.xz} > "$project.$hashcmd" ; ) + +# Set up the gpgflags. +[[ "$keyid" ]] && export gpgflags="--default-key $keyid" +gpg_cansign $gpgflags || bail "Could not find suitable GPG key, skipping signing step." + +# Sign everything. +gpg $gpgflags --detach-sign --armor "$releasedir/$project.$goarch" +gpg $gpgflags --detach-sign --armor "$releasedir/$project.tar.xz" +gpg $gpgflags --clear-sign --armor \ + --output "$releasedir/$project.$hashcmd"{.tmp,} && \ + mv "$releasedir/$project.$hashcmd"{.tmp,} diff --git a/script/tmpmount b/script/tmpmount new file mode 100755 index 0000000..5ac6bc2 --- /dev/null +++ b/script/tmpmount @@ -0,0 +1,4 @@ +#!/bin/bash + +mount -t tmpfs none /tmp +exec "$@" diff --git a/script/validate-c b/script/validate-c new file mode 100755 index 0000000..7c01b51 --- /dev/null +++ b/script/validate-c @@ -0,0 +1,42 @@ +#!/bin/bash + +source "$(dirname "$BASH_SOURCE")/.validate" + +IFS=$'\n' +files=($(validate_diff --diff-filter=ACMR --name-only -- '*.c' | grep -v '^vendor/' || true)) +unset IFS + +# indent(1): "You must use the ‘-T’ option to tell indent the name of all the typenames in your program that are defined by typedef." +INDENT="indent -linux -l120 -T size_t -T jmp_buf" +if [ -z "$(indent --version 2>&1 | grep GNU)" ]; then + echo "Skipping C indentation checks, as GNU indent is not installed." + exit 0 +fi + +badFiles=() +for f in "${files[@]}"; do + orig=$(mktemp) + formatted=$(mktemp) + # we use "git show" here to validate that what's committed is formatted + git show "$VALIDATE_HEAD:$f" > ${orig} + ${INDENT} ${orig} -o ${formatted} + if [ "$(diff -u ${orig} ${formatted})" ]; then + badFiles+=("$f") + fi + rm -f ${orig} ${formatted} +done + +if [ ${#badFiles[@]} -eq 0 ]; then + echo 'Congratulations! All C source files are properly formatted.' +else + { + echo "These files are not properly formatted:" + for f in "${badFiles[@]}"; do + echo " - $f" + done + echo + echo "Please reformat the above files using \"${INDENT}\" and commit the result." + echo + } >&2 + false +fi diff --git a/script/validate-gofmt b/script/validate-gofmt new file mode 100755 index 0000000..8337ed2 --- /dev/null +++ b/script/validate-gofmt @@ -0,0 +1,30 @@ +#!/bin/bash + +source "$(dirname "$BASH_SOURCE")/.validate" + +IFS=$'\n' +files=($(validate_diff --diff-filter=ACMR --name-only -- '*.go' | grep -v '^vendor/' || true)) +unset IFS + +badFiles=() +for f in "${files[@]}"; do + # we use "git show" here to validate that what's committed is formatted + if [ "$(git show "$VALIDATE_HEAD:$f" | gofmt -s -l)" ]; then + badFiles+=("$f") + fi +done + +if [ ${#badFiles[@]} -eq 0 ]; then + echo 'Congratulations! All Go source files are properly formatted.' +else + { + echo "These files are not properly gofmt'd:" + for f in "${badFiles[@]}"; do + echo " - $f" + done + echo + echo 'Please reformat the above files using "gofmt -s -w" and commit the result.' + echo + } >&2 + false +fi diff --git a/signalmap.go b/signalmap.go new file mode 100644 index 0000000..f9a6347 --- /dev/null +++ b/signalmap.go @@ -0,0 +1,47 @@ +// +build linux +// +build !mips,!mipsle,!mips64,!mips64le + +package main + +import ( + "syscall" + + "golang.org/x/sys/unix" +) + +var signalMap = map[string]syscall.Signal{ + "ABRT": unix.SIGABRT, + "ALRM": unix.SIGALRM, + "BUS": unix.SIGBUS, + "CHLD": unix.SIGCHLD, + "CLD": unix.SIGCLD, + "CONT": unix.SIGCONT, + "FPE": unix.SIGFPE, + "HUP": unix.SIGHUP, + "ILL": unix.SIGILL, + "INT": unix.SIGINT, + "IO": unix.SIGIO, + "IOT": unix.SIGIOT, + "KILL": unix.SIGKILL, + "PIPE": unix.SIGPIPE, + "POLL": unix.SIGPOLL, + "PROF": unix.SIGPROF, + "PWR": unix.SIGPWR, + "QUIT": unix.SIGQUIT, + "SEGV": unix.SIGSEGV, + "STKFLT": unix.SIGSTKFLT, + "STOP": unix.SIGSTOP, + "SYS": unix.SIGSYS, + "TERM": unix.SIGTERM, + "TRAP": unix.SIGTRAP, + "TSTP": unix.SIGTSTP, + "TTIN": unix.SIGTTIN, + "TTOU": unix.SIGTTOU, + "URG": unix.SIGURG, + "USR1": unix.SIGUSR1, + "USR2": unix.SIGUSR2, + "VTALRM": unix.SIGVTALRM, + "WINCH": unix.SIGWINCH, + "XCPU": unix.SIGXCPU, + "XFSZ": unix.SIGXFSZ, +} diff --git a/signalmap_mipsx.go b/signalmap_mipsx.go new file mode 100644 index 0000000..046bf15 --- /dev/null +++ b/signalmap_mipsx.go @@ -0,0 +1,45 @@ +// +build linux,mips linux,mipsle linux,mips64 linux,mips64le + +package main + +import ( + "syscall" + + "golang.org/x/sys/unix" +) + +var signalMap = map[string]syscall.Signal{ + "ABRT": unix.SIGABRT, + "ALRM": unix.SIGALRM, + "BUS": unix.SIGBUS, + "CHLD": unix.SIGCHLD, + "CLD": unix.SIGCLD, + "CONT": unix.SIGCONT, + "FPE": unix.SIGFPE, + "HUP": unix.SIGHUP, + "ILL": unix.SIGILL, + "INT": unix.SIGINT, + "IO": unix.SIGIO, + "IOT": unix.SIGIOT, + "KILL": unix.SIGKILL, + "PIPE": unix.SIGPIPE, + "POLL": unix.SIGPOLL, + "PROF": unix.SIGPROF, + "PWR": unix.SIGPWR, + "QUIT": unix.SIGQUIT, + "SEGV": unix.SIGSEGV, + "STOP": unix.SIGSTOP, + "SYS": unix.SIGSYS, + "TERM": unix.SIGTERM, + "TRAP": unix.SIGTRAP, + "TSTP": unix.SIGTSTP, + "TTIN": unix.SIGTTIN, + "TTOU": unix.SIGTTOU, + "URG": unix.SIGURG, + "USR1": unix.SIGUSR1, + "USR2": unix.SIGUSR2, + "VTALRM": unix.SIGVTALRM, + "WINCH": unix.SIGWINCH, + "XCPU": unix.SIGXCPU, + "XFSZ": unix.SIGXFSZ, +} diff --git a/signals.go b/signals.go new file mode 100644 index 0000000..b67f65a --- /dev/null +++ b/signals.go @@ -0,0 +1,139 @@ +// +build linux + +package main + +import ( + "os" + "os/signal" + "syscall" // only for Signal + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const signalBufferSize = 2048 + +// newSignalHandler returns a signal handler for processing SIGCHLD and SIGWINCH signals +// while still forwarding all other signals to the process. +// If notifySocket is present, use it to read systemd notifications from the container and +// forward them to notifySocketHost. +func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler { + if enableSubreaper { + // set us as the subreaper before registering the signal handler for the container + if err := system.SetSubreaper(1); err != nil { + logrus.Warn(err) + } + } + // ensure that we have a large buffer size so that we do not miss any signals + // in case we are not processing them fast enough. + s := make(chan os.Signal, signalBufferSize) + // handle all signals for the process. + signal.Notify(s) + return &signalHandler{ + signals: s, + notifySocket: notifySocket, + } +} + +// exit models a process exit status with the pid and +// exit status. +type exit struct { + pid int + status int +} + +type signalHandler struct { + signals chan os.Signal + notifySocket *notifySocket +} + +// forward handles the main signal event loop forwarding, resizing, or reaping depending +// on the signal received. +func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach bool) (int, error) { + // make sure we know the pid of our main process so that we can return + // after it dies. + if detach && h.notifySocket == nil { + return 0, nil + } + + pid1, err := process.Pid() + if err != nil { + return -1, err + } + + if h.notifySocket != nil { + if detach { + h.notifySocket.run(pid1) + return 0, nil + } + go h.notifySocket.run(0) + } + + // Perform the initial tty resize. Always ignore errors resizing because + // stdout might have disappeared (due to races with when SIGHUP is sent). + _ = tty.resize() + // Handle and forward signals. + for s := range h.signals { + switch s { + case unix.SIGWINCH: + // Ignore errors resizing, as above. + _ = tty.resize() + case unix.SIGCHLD: + exits, err := h.reap() + if err != nil { + logrus.Error(err) + } + for _, e := range exits { + logrus.WithFields(logrus.Fields{ + "pid": e.pid, + "status": e.status, + }).Debug("process exited") + if e.pid == pid1 { + // call Wait() on the process even though we already have the exit + // status because we must ensure that any of the go specific process + // fun such as flushing pipes are complete before we return. + process.Wait() + if h.notifySocket != nil { + h.notifySocket.Close() + } + return e.status, nil + } + } + default: + logrus.Debugf("sending signal to process %s", s) + if err := unix.Kill(pid1, s.(syscall.Signal)); err != nil { + logrus.Error(err) + } + } + } + return -1, nil +} + +// reap runs wait4 in a loop until we have finished processing any existing exits +// then returns all exits to the main event loop for further processing. +func (h *signalHandler) reap() (exits []exit, err error) { + var ( + ws unix.WaitStatus + rus unix.Rusage + ) + for { + pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus) + if err != nil { + if err == unix.ECHILD { + return exits, nil + } + return nil, err + } + if pid <= 0 { + return exits, nil + } + exits = append(exits, exit{ + pid: pid, + status: utils.ExitStatus(ws), + }) + } +} diff --git a/spec.go b/spec.go new file mode 100644 index 0000000..322a83d --- /dev/null +++ b/spec.go @@ -0,0 +1,145 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var specCommand = cli.Command{ + Name: "spec", + Usage: "create a new specification file", + ArgsUsage: "", + Description: `The spec command creates the new specification file named "` + specConfig + `" for +the bundle. + +The spec generated is just a starter file. Editing of the spec is required to +achieve desired results. For example, the newly generated spec includes an args +parameter that is initially set to call the "sh" command when the container is +started. Calling "sh" may work for an ubuntu container or busybox, but will not +work for containers that do not include the "sh" program. + +EXAMPLE: + To run docker's hello-world container one needs to set the args parameter +in the spec to call hello. This can be done using the sed command or a text +editor. The following commands create a bundle for hello-world, change the +default args parameter in the spec from "sh" to "/hello", then run the hello +command in a new hello-world container named container1: + + mkdir hello + cd hello + docker pull hello-world + docker export $(docker create hello-world) > hello-world.tar + mkdir rootfs + tar -C rootfs -xf hello-world.tar + runc spec + sed -i 's;"sh";"/hello";' ` + specConfig + ` + runc run container1 + +In the run command above, "container1" is the name for the instance of the +container that you are starting. The name you provide for the container instance +must be unique on your host. + +An alternative for generating a customized spec config is to use "oci-runtime-tool", the +sub-command "oci-runtime-tool generate" has lots of options that can be used to do any +customizations as you want, see runtime-tools (https://github.com/opencontainers/runtime-tools) +to get more information. + +When starting a container through runc, runc needs root privilege. If not +already running as root, you can use sudo to give runc root privilege. For +example: "sudo runc start container1" will give runc root privilege to start the +container on your host. + +Alternatively, you can start a rootless container, which has the ability to run +without root privileges. For this to work, the specification file needs to be +adjusted accordingly. You can pass the parameter --rootless to this command to +generate a proper rootless spec file. + +Note that --rootless is not needed when you execute runc as the root in a user namespace +created by an unprivileged user. +`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: "path to the root of the bundle directory", + }, + cli.BoolFlag{ + Name: "rootless", + Usage: "generate a configuration for a rootless container", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 0, exactArgs); err != nil { + return err + } + spec := specconv.Example() + + rootless := context.Bool("rootless") + if rootless { + specconv.ToRootless(spec) + } + + checkNoFile := func(name string) error { + _, err := os.Stat(name) + if err == nil { + return fmt.Errorf("File %s exists. Remove it first", name) + } + if !os.IsNotExist(err) { + return err + } + return nil + } + bundle := context.String("bundle") + if bundle != "" { + if err := os.Chdir(bundle); err != nil { + return err + } + } + if err := checkNoFile(specConfig); err != nil { + return err + } + data, err := json.MarshalIndent(spec, "", "\t") + if err != nil { + return err + } + return ioutil.WriteFile(specConfig, data, 0666) + }, +} + +// loadSpec loads the specification from the provided path. +func loadSpec(cPath string) (spec *specs.Spec, err error) { + cf, err := os.Open(cPath) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("JSON specification file %s not found", cPath) + } + return nil, err + } + defer cf.Close() + + if err = json.NewDecoder(cf).Decode(&spec); err != nil { + return nil, err + } + return spec, validateProcessSpec(spec.Process) +} + +func createLibContainerRlimit(rlimit specs.POSIXRlimit) (configs.Rlimit, error) { + rl, err := strToRlimit(rlimit.Type) + if err != nil { + return configs.Rlimit{}, err + } + return configs.Rlimit{ + Type: rl, + Hard: rlimit.Hard, + Soft: rlimit.Soft, + }, nil +} diff --git a/start.go b/start.go new file mode 100644 index 0000000..2bb698b --- /dev/null +++ b/start.go @@ -0,0 +1,43 @@ +package main + +import ( + "errors" + "fmt" + + "github.com/opencontainers/runc/libcontainer" + "github.com/urfave/cli" +) + +var startCommand = cli.Command{ + Name: "start", + Usage: "executes the user defined process in a created container", + ArgsUsage: ` + +Where "" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The start command executes the user defined process in a created container.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + status, err := container.Status() + if err != nil { + return err + } + switch status { + case libcontainer.Created: + return container.Exec() + case libcontainer.Stopped: + return errors.New("cannot start a container that has stopped") + case libcontainer.Running: + return errors.New("cannot start an already running container") + default: + return fmt.Errorf("cannot start a container in the %s state\n", status) + } + }, +} diff --git a/state.go b/state.go new file mode 100644 index 0000000..718813c --- /dev/null +++ b/state.go @@ -0,0 +1,60 @@ +// +build linux + +package main + +import ( + "encoding/json" + "os" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +var stateCommand = cli.Command{ + Name: "state", + Usage: "output the state of a container", + ArgsUsage: ` + +Where "" is your name for the instance of the container.`, + Description: `The state command outputs current state information for the +instance of a container.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + containerStatus, err := container.Status() + if err != nil { + return err + } + state, err := container.State() + if err != nil { + return err + } + pid := state.BaseState.InitProcessPid + if containerStatus == libcontainer.Stopped { + pid = 0 + } + bundle, annotations := utils.Annotations(state.Config.Labels) + cs := containerState{ + Version: state.BaseState.Config.Version, + ID: state.BaseState.ID, + InitProcessPid: pid, + Status: containerStatus.String(), + Bundle: bundle, + Rootfs: state.BaseState.Config.Rootfs, + Created: state.BaseState.Created, + Annotations: annotations, + } + data, err := json.MarshalIndent(cs, "", " ") + if err != nil { + return err + } + os.Stdout.Write(data) + return nil + }, +} diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 0000000..8ee6ebf --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,83 @@ +# runc Integration Tests + +Integration tests provide end-to-end testing of runc. + +Note that integration tests do **not** replace unit tests. + +As a rule of thumb, code should be tested thoroughly with unit tests. +Integration tests on the other hand are meant to test a specific feature end +to end. + +Integration tests are written in *bash* using the +[bats](https://github.com/sstephenson/bats) framework. + +## Running integration tests + +The easiest way to run integration tests is with Docker: +``` +$ make integration +``` +Alternatively, you can run integration tests directly on your host through make: +``` +$ sudo make localintegration +``` +Or you can just run them directly using bats +``` +$ sudo bats tests/integration +``` +To run a single test bucket: +``` +$ make integration TESTPATH="/checkpoint.bats" +``` + + +To run them on your host, you will need to setup a development environment plus +[bats](https://github.com/sstephenson/bats#installing-bats-from-source) +For example: +``` +$ cd ~/go/src/github.com +$ git clone https://github.com/sstephenson/bats.git +$ cd bats +$ ./install.sh /usr/local +``` + +> **Note**: There are known issues running the integration tests using +> **devicemapper** as a storage driver, make sure that your docker daemon +> is using **aufs** if you want to successfully run the integration tests. + +## Writing integration tests + +[helper functions] +(https://github.com/opencontainers/runc/blob/master/test/integration/helpers.bash) +are provided in order to facilitate writing tests. + +```sh +#!/usr/bin/env bats + +# This will load the helpers. +load helpers + +# setup is called at the beginning of every test. +function setup() { + # see functions teardown_hello and setup_hello in helpers.bash, used to + # create a pristine environment for running your tests + teardown_hello + setup_hello +} + +# teardown is called at the end of every test. +function teardown() { + teardown_hello +} + +@test "this is a simple test" { + runc run containerid + # "The runc macro" automatically populates $status, $output and $lines. + # Please refer to bats documentation to find out more. + [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +``` diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats new file mode 100644 index 0000000..17812ab --- /dev/null +++ b/tests/integration/cgroups.bats @@ -0,0 +1,127 @@ +#!/usr/bin/env bats + +load helpers + +function teardown() { + rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json + teardown_running_container test_cgroups_kmem + teardown_running_container test_cgroups_permissions + teardown_busybox +} + +function setup() { + teardown + setup_busybox +} + +function check_cgroup_value() { + cgroup=$1 + source=$2 + expected=$3 + + current=$(cat $cgroup/$source) + echo $cgroup/$source + echo "current" $current "!?" "$expected" + [ "$current" -eq "$expected" ] +} + +@test "runc update --kernel-memory (initialized)" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_kmem + + set_cgroups_path "$BUSYBOX_BUNDLE" + + # Set some initial known values + DATA=$(cat <<-EOF + "memory": { + "kernel": 16777216 + }, +EOF + ) + DATA=$(echo ${DATA} | sed 's/\n/\\n/g') + sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json + + # run a detached busybox to work with + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem + [ "$status" -eq 0 ] + + # update kernel memory limit + runc update test_cgroups_kmem --kernel-memory 50331648 + [ "$status" -eq 0 ] + + # check the value + check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 +} + +@test "runc update --kernel-memory (uninitialized)" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_kmem + + set_cgroups_path "$BUSYBOX_BUNDLE" + + # run a detached busybox to work with + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem + [ "$status" -eq 0 ] + + # update kernel memory limit + runc update test_cgroups_kmem --kernel-memory 50331648 + # Since kernel 4.6, we can update kernel memory without initialization + # because it's accounted by default. + if [ "$KERNEL_MAJOR" -lt 4 ] || [ "$KERNEL_MAJOR" -eq 4 -a "$KERNEL_MINOR" -le 5 ]; then + [ ! "$status" -eq 0 ] + else + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 + fi +} + +@test "runc create (no limits + no cgrouppath + no permission) succeeds" { + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 0 ] +} + +@test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" { + requires rootless + requires rootless_no_cgroup + + set_cgroups_path "$BUSYBOX_BUNDLE" + + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 1 ] + [[ ${lines[1]} == *"permission denied"* ]] +} + +@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" { + requires rootless + requires rootless_no_cgroup + + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 1 ] + [[ ${lines[1]} == *"cannot set pids limit: container could not join or create cgroup"* ]] +} + +@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 0 ] +} + +@test "runc exec (limits + cgrouppath + permission on the cgroup dir) succeeds" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 0 ] + + runc exec test_cgroups_permissions echo "cgroups_exec" + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"cgroups_exec"* ]] +} diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats new file mode 100644 index 0000000..87696df --- /dev/null +++ b/tests/integration/checkpoint.bats @@ -0,0 +1,348 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then + skip "CRIU test suite is skipped on systemd cgroup driver for now." + fi + + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "checkpoint and restore" { + # XXX: currently criu require root containers. + requires criu root + + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + for i in `seq 2`; do + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + ret=$? + # if you are having problems getting criu to work uncomment the following dump: + #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log + cat ./work-dir/dump.log | grep -B 5 Error || true + [ "$ret" -eq 0 ] + + # after checkpoint busybox is no longer running + runc state test_busybox + [ "$status" -ne 0 ] + + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox + ret=$? + cat ./work-dir/restore.log | grep -B 5 Error || true + [ "$ret" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + done +} + +@test "checkpoint --pre-dump and restore" { + # XXX: currently criu require root containers. + requires criu root + + # The changes to 'terminal' are needed for running in detached mode + sed -i 's;"terminal": true;"terminal": false;' config.json + sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json + + # The following code creates pipes for stdin and stdout. + # CRIU can't handle fifo-s, so we need all these tricks. + fifo=`mktemp -u /tmp/runc-fifo-XXXXXX` + mkfifo $fifo + + # stdout + cat $fifo | cat $fifo & + pid=$! + exec 50/proc/$pid/fd/0 + + # stdin + cat $fifo | cat $fifo & + pid=$! + exec 60/proc/$pid/fd/0 + + echo -n > $fifo + unlink $fifo + + # run busybox + __runc run -d test_busybox <&60 >&51 2>&51 + [ $? -eq 0 ] + + testcontainer test_busybox running + + #test checkpoint pre-dump + mkdir parent-dir + runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox + [ "$status" -eq 0 ] + + # busybox should still be running + runc state test_busybox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] + + # checkpoint the running container + mkdir image-dir + mkdir work-dir + runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox + cat ./work-dir/dump.log | grep -B 5 Error || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + runc state test_busybox + [ "$status" -ne 0 ] + + # restore from checkpoint + __runc --criu "$CRIU" restore -d --work-path ./work-dir --image-path ./image-dir test_busybox <&60 >&51 2>&51 + ret=$? + cat ./work-dir/restore.log | grep -B 5 Error || true + [ $ret -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + + runc exec --cwd /bin test_busybox echo ok + [ "$status" -eq 0 ] + [[ ${output} == "ok" ]] + + echo Ping >&61 + exec 61>&- + exec 51>&- + run cat <&50 + [ "$status" -eq 0 ] + [[ "${output}" == *"ponG Ping"* ]] +} + +@test "checkpoint --lazy-pages and restore" { + # XXX: currently criu require root containers. + requires criu root + + # check if lazy-pages is supported + run ${CRIU} check --feature uffd-noncoop + if [ "$status" -eq 1 ]; then + # this criu does not support lazy migration; skip the test + skip "this criu does not support lazy migration" + fi + + # The changes to 'terminal' are needed for running in detached mode + sed -i 's;"terminal": true;"terminal": false;' config.json + # This should not be necessary: https://github.com/checkpoint-restore/criu/issues/575 + sed -i 's;"readonly": true;"readonly": false;' config.json + sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json + + # The following code creates pipes for stdin and stdout. + # CRIU can't handle fifo-s, so we need all these tricks. + fifo=`mktemp -u /tmp/runc-fifo-XXXXXX` + mkfifo $fifo + + # For lazy migration we need to know when CRIU is ready to serve + # the memory pages via TCP. + lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX` + mkfifo $lazy_pipe + + # TCP port for lazy migration + port=27277 + + # stdout + cat $fifo | cat $fifo & + pid=$! + exec 50/proc/$pid/fd/0 + + # stdin + cat $fifo | cat $fifo & + pid=$! + exec 60/proc/$pid/fd/0 + + echo -n > $fifo + unlink $fifo + + # run busybox + __runc run -d test_busybox <&60 >&51 2>&51 + [ $? -eq 0 ] + + testcontainer test_busybox running + + # checkpoint the running container + mkdir image-dir + mkdir work-dir + # Double fork taken from helpers.bats + # We need to start 'runc checkpoint --lazy-pages' in the background, + # so we double fork in the shell. + (runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) & + # Sleeping here. This is ugly, but not sure how else to handle it. + # The return code of the in the background running runc is needed, if + # there is some basic error. If the lazy migration is ready can + # be handled by $lazy_pipe. Which probably will always be ready + # after sleeping two seconds. + sleep 2 + # Check if inventory.img was written + [ -e image-dir/inventory.img ] + # If the inventory.img exists criu checkpointed some things, let's see + # if there were other errors in the log file. + run grep -B 5 Error ./work-dir/dump.log -q + [ "$status" -eq 1 ] + + # This will block until CRIU is ready to serve memory pages + cat $lazy_pipe + [ "$status" -eq 1 ] + + unlink $lazy_pipe + + # Double fork taken from helpers.bats + # We need to start 'criu lazy-pages' in the background, + # so we double fork in the shell. + # Start CRIU in lazy-daemon mode + $(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) & + + # Restore lazily from checkpoint. + # The restored container needs a different name as the checkpointed + # container is not yet destroyed. It is only destroyed at that point + # in time when the last page is lazily transferred to the destination. + # Killing the CRIU on the checkpoint side will let the container + # continue to run if the migration failed at some point. + __runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51 + ret=$? + [ $ret -eq 0 ] + run grep -B 5 Error ./work-dir/dump.log -q + [ "$status" -eq 1 ] + + # busybox should be back up and running + testcontainer test_busybox_restore running + + runc exec --cwd /bin test_busybox_restore echo ok + [ "$status" -eq 0 ] + [[ ${output} == "ok" ]] + + echo Ping >&61 + exec 61>&- + exec 51>&- + run cat <&50 + [ "$status" -eq 0 ] + [[ "${output}" == *"ponG Ping"* ]] +} + +@test "checkpoint and restore in external network namespace" { + # XXX: currently criu require root containers. + requires criu root + + # check if external_net_ns is supported; only with criu 3.10++ + run ${CRIU} check --feature external_net_ns + if [ "$status" -eq 1 ]; then + # this criu does not support external_net_ns; skip the test + skip "this criu does not support external network namespaces" + fi + + # create a temporary name for the test network namespace + tmp=`mktemp` + rm -f $tmp + ns_name=`basename $tmp` + # create network namespace + ip netns add $ns_name + ns_path=`ip netns add $ns_name 2>&1 | sed -e 's/.*"\(.*\)".*/\1/'` + + ns_inode=`ls -iL $ns_path | awk '{ print $1 }'` + + # tell runc which network namespace to use + sed -i "s;\"type\": \"network\";\"type\": \"network\",\"path\": \"$ns_path\";" config.json + + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + for i in `seq 2`; do + # checkpoint the running container; this automatically tells CRIU to + # handle the network namespace defined in config.json as an external + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + ret=$? + # if you are having problems getting criu to work uncomment the following dump: + #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log + cat ./work-dir/dump.log | grep -B 5 Error || true + [ "$ret" -eq 0 ] + + # after checkpoint busybox is no longer running + runc state test_busybox + [ "$status" -ne 0 ] + + # restore from checkpoint; this should restore the container into the existing network namespace + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox + ret=$? + cat ./work-dir/restore.log | grep -B 5 Error || true + [ "$ret" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + + # container should be running in same network namespace as before + pid=`__runc state test_busybox | jq '.pid'` + ns_inode_new=`readlink /proc/$pid/ns/net | sed -e 's/.*\[\(.*\)\]/\1/'` + echo "old network namespace inode $ns_inode" + echo "new network namespace inode $ns_inode_new" + [ "$ns_inode" -eq "$ns_inode_new" ] + done + ip netns del $ns_name +} + +@test "checkpoint and restore with container specific CRIU config" { + # XXX: currently criu require root containers. + requires criu root + + tmp=`mktemp /tmp/runc-criu-XXXXXX.conf` + # This is the file we write to /etc/criu/default.conf + tmplog1=`mktemp /tmp/runc-criu-log-XXXXXX.log` + unlink $tmplog1 + tmplog1=`basename $tmplog1` + # That is the actual configuration file to be used + tmplog2=`mktemp /tmp/runc-criu-log-XXXXXX.log` + unlink $tmplog2 + tmplog2=`basename $tmplog2` + # This adds the annotation 'org.criu.config' to set a container + # specific CRIU config file. + sed -i "s;\"process\";\"annotations\":{\"org.criu.config\": \"$tmp\"},\"process\";" config.json + # Tell CRIU to use another configuration file + mkdir -p /etc/criu + echo "log-file=$tmplog1" > /etc/criu/default.conf + # Make sure the RPC defined configuration file overwrites the previous + echo "log-file=$tmplog2" > $tmp + + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + [ "$status" -eq 0 ] + ! test -f ./work-dir/$tmplog1 + test -f ./work-dir/$tmplog2 + + # after checkpoint busybox is no longer running + runc state test_busybox + [ "$status" -ne 0 ] + + test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2 + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + ! test -f ./work-dir/$tmplog1 + test -f ./work-dir/$tmplog2 + + # busybox should be back up and running + testcontainer test_busybox running + unlink $tmp + test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2 +} + diff --git a/tests/integration/config.json b/tests/integration/config.json new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/create.bats b/tests/integration/create.bats new file mode 100644 index 0000000..abd4da2 --- /dev/null +++ b/tests/integration/create.bats @@ -0,0 +1,89 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc create" { + runc create --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create exec" { + runc create --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + runc exec test_busybox true + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create --pid-file" { + runc create --pid-file pid.txt --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create --pid-file with new CWD" { + # create pid_file directory as the CWD + run mkdir pid_file + [ "$status" -eq 0 ] + run cd pid_file + [ "$status" -eq 0 ] + + runc create --pid-file pid.txt -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats new file mode 100644 index 0000000..e02cf4a --- /dev/null +++ b/tests/integration/debug.bats @@ -0,0 +1,81 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_hello + setup_hello +} + +function teardown() { + teardown_hello +} + +@test "global --debug" { + # run hello-world + runc --debug run test_hello + echo "${output}" + [ "$status" -eq 0 ] + + # check expected debug output was sent to stderr + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} + +@test "global --debug to --log" { + # run hello-world + runc --log log.out --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + run cat log.out + [ "$status" -eq 0 ] + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} + +@test "global --debug to --log --log-format 'text'" { + # run hello-world + runc --log log.out --log-format "text" --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + run cat log.out + [ "$status" -eq 0 ] + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} + +@test "global --debug to --log --log-format 'json'" { + # run hello-world + runc --log log.out --log-format "json" --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + run cat log.out + [ "$status" -eq 0 ] + [[ "${output}" == *'"level":"debug"'* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats new file mode 100644 index 0000000..c5ed215 --- /dev/null +++ b/tests/integration/delete.bats @@ -0,0 +1,53 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc delete" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + # wait for busybox to be in the destroyed state + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + # delete test_busybox + runc delete test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "runc delete --force" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # force delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "runc delete --force ignore not exist" { + runc delete --force notexists + [ "$status" -eq 0 ] +} diff --git a/tests/integration/events.bats b/tests/integration/events.bats new file mode 100644 index 0000000..b3e6315 --- /dev/null +++ b/tests/integration/events.bats @@ -0,0 +1,109 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "events --stats" { + # XXX: currently cgroups require root containers. + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # generate stats + runc events --stats test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]] + [[ "${lines[0]}" == *"data"* ]] +} + +@test "events --interval default " { + # XXX: currently cgroups require root containers. + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # spawn two sub processes (shells) + # the first sub process is an event logger that sends stats events to events.log + # the second sub process waits for an event that includes test_busybox then + # kills the test_busybox container which causes the event logger to exit + (__runc events test_busybox > events.log) & + ( + retry 10 1 eval "grep -q 'test_busybox' events.log" + teardown_running_container test_busybox + ) & + wait # wait for the above sub shells to finish + + [ -e events.log ] + + run cat events.log + [ "$status" -eq 0 ] + [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]] + [[ "${lines[0]}" == *"data"* ]] +} + +@test "events --interval 1s " { + # XXX: currently cgroups require root containers. + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # spawn two sub processes (shells) + # the first sub process is an event logger that sends stats events to events.log once a second + # the second sub process tries 3 times for an event that incudes test_busybox + # pausing 1s between each attempt then kills the test_busybox container which + # causes the event logger to exit + (__runc events --interval 1s test_busybox > events.log) & + ( + retry 3 1 eval "grep -q 'test_busybox' events.log" + teardown_running_container test_busybox + ) & + wait # wait for the above sub shells to finish + + [ -e events.log ] + + run eval "grep -q 'test_busybox' events.log" + [ "$status" -eq 0 ] +} + +@test "events --interval 100ms " { + # XXX: currently cgroups require root containers. + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + #prove there is no carry over of events.log from a prior test + [ ! -e events.log ] + + # spawn two sub processes (shells) + # the first sub process is an event logger that sends stats events to events.log once every 100ms + # the second sub process tries 3 times for an event that incudes test_busybox + # pausing 100s between each attempt then kills the test_busybox container which + # causes the event logger to exit + (__runc events --interval 100ms test_busybox > events.log) & + ( + retry 3 0.100 eval "grep -q 'test_busybox' events.log" + teardown_running_container test_busybox + ) & + wait # wait for the above sub shells to finish + + [ -e events.log ] + + run eval "grep -q 'test_busybox' events.log" + [ "$status" -eq 0 ] +} diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats new file mode 100644 index 0000000..19647c1 --- /dev/null +++ b/tests/integration/exec.bats @@ -0,0 +1,140 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc exec" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] +} + +@test "runc exec --pid-file" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] + [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc exec --pid-file with new CWD" { + # create pid_file directory as the CWD + run mkdir pid_file + [ "$status" -eq 0 ] + run cd pid_file + [ "$status" -eq 0 ] + + # run busybox detached + runc run -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] + [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc exec ls -la" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox ls -la + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"total"* ]] + [[ ${lines[1]} == *"."* ]] + [[ ${lines[2]} == *".."* ]] +} + +@test "runc exec ls -la with --cwd" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec --cwd /bin test_busybox pwd + [ "$status" -eq 0 ] + [[ ${output} == "/bin"* ]] +} + +@test "runc exec --env" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec --env RUNC_EXEC_TEST=true test_busybox env + [ "$status" -eq 0 ] + + [[ ${output} == *"RUNC_EXEC_TEST=true"* ]] +} + +@test "runc exec --user" { + # --user can't work in rootless containers that don't have idmap. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec --user 1000:1000 test_busybox id + [ "$status" -eq 0 ] + + [[ "${output}" == "uid=1000 gid=1000"* ]] +} + +@test "runc exec --additional-gids" { + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + wait_for_container 15 1 test_busybox + + runc exec --user 1000:1000 --additional-gids 100 --additional-gids 65534 test_busybox id + [ "$status" -eq 0 ] + + [[ ${output} == "uid=1000 gid=1000 groups=100(users),65534(nogroup)" ]] +} + +@test "runc exec --preserve-fds" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + run bash -c "cat hello > preserve-fds.test; exec 3/ { print $5; exit }') +CGROUP_CPU_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\/ { print $5; exit }') +if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then + CGROUPS_PATH="/machine.slice/runc-cgroups-integration-test.scope" +else + CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup" +fi +CGROUP_MEMORY="${CGROUP_MEMORY_BASE_PATH}${CGROUPS_PATH}" + +# CONFIG_MEMCG_KMEM support +KMEM="${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes" +RT_PERIOD="${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" + +# Check if we're in rootless mode. +ROOTLESS=$(id -u) + +# Wrapper for runc. +function runc() { + run __runc "$@" + + # Some debug information to make life easier. bats will only print it if the + # test failed, in which case the output is useful. + echo "runc $@ (status=$status):" >&2 + echo "$output" >&2 +} + +# Raw wrapper for runc. +function __runc() { + "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} --root "$ROOT" "$@" +} + +# Wrapper for runc spec, which takes only one argument (the bundle path). +function runc_spec() { + ! [[ "$#" > 1 ]] + + local args=() + local bundle="" + + if [ "$ROOTLESS" -ne 0 ]; then + args+=("--rootless") + fi + if [ "$#" -ne 0 ]; then + bundle="$1" + args+=("--bundle" "$bundle") + fi + + runc spec "${args[@]}" + + # Always add additional mappings if we have idmaps. + if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"idmap"* ]]; then + runc_rootless_idmap "$bundle" + fi + + # Ensure config.json contains linux.resources + if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then + runc_rootless_cgroup "$bundle" + fi +} + +# Shortcut to add additional uids and gids, based on the values set as part of +# a rootless configuration. +function runc_rootless_idmap() { + bundle="${1:-.}" + cat "$bundle/config.json" \ + | jq '.mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .)' \ + | jq '.linux.uidMappings |= .+ [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}]' \ + | jq '.linux.gidMappings |= .+ [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}]' \ + | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+10))"', "containerID": 1, "size": 20}]' \ + | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+100))"', "containerID": 1000, "size": '"$(($ROOTLESS_GIDMAP_LENGTH-1000))"'}]' \ + >"$bundle/config.json.tmp" + mv "$bundle/config.json"{.tmp,} +} + +# Shortcut to add empty resources as part of a rootless configuration. +function runc_rootless_cgroup() { + bundle="${1:-.}" + cat "$bundle/config.json" \ + | jq '.linux.resources |= .+ {"memory":{},"cpu":{},"blockio":{},"pids":{}}' \ + >"$bundle/config.json.tmp" + mv "$bundle/config.json"{.tmp,} +} + +# Helper function to set cgroupsPath to the value of $CGROUPS_PATH +function set_cgroups_path() { + bundle="${1:-.}" + cgroups_path="/runc-cgroups-integration-test/test-cgroup" + if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then + cgroups_path="machine.slice:runc-cgroups:integration-test" + fi + sed -i 's#\("linux": {\)#\1\n "cgroupsPath": "'"${cgroups_path}"'",#' "$bundle/config.json" +} + +# Helper function to set a resources limit +function set_resources_limit() { + bundle="${1:-.}" + sed -i 's/\("linux": {\)/\1\n "resources": { "pids": { "limit": 100 } },/' "$bundle/config.json" +} + +# Fails the current test, providing the error given. +function fail() { + echo "$@" >&2 + exit 1 +} + +# Allows a test to specify what things it requires. If the environment can't +# support it, the test is skipped with a message. +function requires() { + for var in "$@"; do + case $var in + criu) + if [ ! -e "$CRIU" ]; then + skip "test requires ${var}" + fi + ;; + root) + if [ "$ROOTLESS" -ne 0 ]; then + skip "test requires ${var}" + fi + ;; + rootless) + if [ "$ROOTLESS" -eq 0 ]; then + skip "test requires ${var}" + fi + ;; + rootless_idmap) + if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then + skip "test requires ${var}" + fi + ;; + rootless_cgroup) + if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then + skip "test requires ${var}" + fi + ;; + rootless_no_cgroup) + if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then + skip "test requires ${var}" + fi + ;; + cgroups_kmem) + if [ ! -e "$KMEM" ]; then + skip "Test requires ${var}" + fi + ;; + cgroups_rt) + if [ ! -e "$RT_PERIOD" ]; then + skip "Test requires ${var}" + fi + ;; + *) + fail "BUG: Invalid requires ${var}." + ;; + esac + done +} + +# Retry a command $1 times until it succeeds. Wait $2 seconds between retries. +function retry() { + local attempts=$1 + shift + local delay=$1 + shift + local i + + for ((i = 0; i < attempts; i++)); do + run "$@" + if [[ "$status" -eq 0 ]]; then + return 0 + fi + sleep $delay + done + + echo "Command \"$@\" failed $attempts times. Output: $output" + false +} + +# retry until the given container has state +function wait_for_container() { + local attempts=$1 + local delay=$2 + local cid=$3 + # optionally wait for a specific status + local wait_for_status="${4:-}" + local i + + for ((i = 0; i < attempts; i++)); do + runc state $cid + if [[ "$status" -eq 0 ]]; then + if [[ "${output}" == *"${wait_for_status}"* ]]; then + return 0 + fi + fi + sleep $delay + done + + echo "runc state failed to return state $statecheck $attempts times. Output: $output" + false +} + +# retry until the given container has state +function wait_for_container_inroot() { + local attempts=$1 + local delay=$2 + local cid=$3 + # optionally wait for a specific status + local wait_for_status="${4:-}" + local i + + for ((i = 0; i < attempts; i++)); do + ROOT=$4 runc state $cid + if [[ "$status" -eq 0 ]]; then + if [[ "${output}" == *"${wait_for_status}"* ]]; then + return 0 + fi + fi + sleep $delay + done + + echo "runc state failed to return state $statecheck $attempts times. Output: $output" + false +} + +function testcontainer() { + # test state of container + runc state $1 + [ "$status" -eq 0 ] + [[ "${output}" == *"$2"* ]] +} + +function setup_recvtty() { + # We need to start recvtty in the background, so we double fork in the shell. + ("$RECVTTY" --pid-file "$BATS_TMPDIR/recvtty.pid" --mode null "$CONSOLE_SOCKET" &) & +} + +function teardown_recvtty() { + # When we kill recvtty, the container will also be killed. + if [ -f "$BATS_TMPDIR/recvtty.pid" ]; then + kill -9 $(cat "$BATS_TMPDIR/recvtty.pid") + fi + + # Clean up the files that might be left over. + rm -f "$BATS_TMPDIR/recvtty.pid" + rm -f "$CONSOLE_SOCKET" +} + +function setup_busybox() { + setup_recvtty + run mkdir "$BUSYBOX_BUNDLE" + run mkdir "$BUSYBOX_BUNDLE"/rootfs + if [ -e "/testdata/busybox.tar" ]; then + BUSYBOX_IMAGE="/testdata/busybox.tar" + fi + if [ ! -e $BUSYBOX_IMAGE ]; then + curl -o $BUSYBOX_IMAGE -sSL `get_busybox` + fi + tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE" + cd "$BUSYBOX_BUNDLE" + runc_spec +} + +function setup_hello() { + setup_recvtty + run mkdir "$HELLO_BUNDLE" + run mkdir "$HELLO_BUNDLE"/rootfs + tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" + cd "$HELLO_BUNDLE" + runc_spec + sed -i 's;"sh";"/hello";' config.json +} + +function teardown_running_container() { + runc list + # $1 should be a container name such as "test_busybox" + # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix + # e.g. "test_busybox" and "test_busybox_update" + if [[ "${output}" == *"$1 "* ]]; then + runc kill $1 KILL + retry 10 1 eval "__runc state '$1' | grep -q 'stopped'" + runc delete $1 + fi +} + +function teardown_running_container_inroot() { + ROOT=$2 runc list + # $1 should be a container name such as "test_busybox" + # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix + # e.g. "test_busybox" and "test_busybox_update" + if [[ "${output}" == *"$1 "* ]]; then + ROOT=$2 runc kill $1 KILL + retry 10 1 eval "ROOT='$2' __runc state '$1' | grep -q 'stopped'" + ROOT=$2 runc delete $1 + fi +} + +function teardown_busybox() { + cd "$INTEGRATION_ROOT" + teardown_recvtty + teardown_running_container test_busybox + run rm -f -r "$BUSYBOX_BUNDLE" +} + +function teardown_hello() { + cd "$INTEGRATION_ROOT" + teardown_recvtty + teardown_running_container test_hello + run rm -f -r "$HELLO_BUNDLE" +} diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats new file mode 100644 index 0000000..d9afe92 --- /dev/null +++ b/tests/integration/kill.bats @@ -0,0 +1,30 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + + +@test "kill detached busybox" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + runc delete test_busybox + [ "$status" -eq 0 ] +} diff --git a/tests/integration/list.bats b/tests/integration/list.bats new file mode 100644 index 0000000..0a938c0 --- /dev/null +++ b/tests/integration/list.bats @@ -0,0 +1,56 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_running_container_inroot test_box1 $HELLO_BUNDLE + teardown_running_container_inroot test_box2 $HELLO_BUNDLE + teardown_running_container_inroot test_box3 $HELLO_BUNDLE + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_running_container_inroot test_box1 $HELLO_BUNDLE + teardown_running_container_inroot test_box2 $HELLO_BUNDLE + teardown_running_container_inroot test_box3 $HELLO_BUNDLE + teardown_busybox +} + +@test "list" { + # run a few busyboxes detached + ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box1 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box2 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box3 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc list + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + + ROOT=$HELLO_BUNDLE runc list -q + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "test_box1" ]] + [[ "${lines[1]}" == "test_box2" ]] + [[ "${lines[2]}" == "test_box3" ]] + + ROOT=$HELLO_BUNDLE runc list --format table + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + + ROOT=$HELLO_BUNDLE runc list --format json + [ "$status" -eq 0 ] + [[ "${lines[0]}" == [\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]] +} diff --git a/tests/integration/mask.bats b/tests/integration/mask.bats new file mode 100644 index 0000000..aaa8042 --- /dev/null +++ b/tests/integration/mask.bats @@ -0,0 +1,59 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox + + # Create fake rootfs. + mkdir rootfs/testdir + echo "Forbidden information!" > rootfs/testfile + + # add extra masked paths + sed -i 's;"maskedPaths": \[;"maskedPaths": \["/testdir","/testfile",;g' config.json +} + +function teardown() { + teardown_busybox +} + +@test "mask paths [file]" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox cat /testfile + [ "$status" -eq 0 ] + [[ "${output}" == "" ]] + + runc exec test_busybox rm -f /testfile + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_busybox umount /testfile + [ "$status" -eq 1 ] + [[ "${output}" == *"Operation not permitted"* ]] +} + +@test "mask paths [directory]" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox ls /testdir + [ "$status" -eq 0 ] + [[ "${output}" == "" ]] + + runc exec test_busybox touch /testdir/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_busybox rm -rf /testdir + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_busybox umount /testdir + [ "$status" -eq 1 ] + [[ "${output}" == *"Operation not permitted"* ]] +} diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats new file mode 100755 index 0000000..c35b3c5 --- /dev/null +++ b/tests/integration/mounts.bats @@ -0,0 +1,21 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run [bind mount]" { + CONFIG=$(jq '.mounts |= . + [{"source": ".", "destination": "/tmp/bind", "options": ["bind"]}] | .process.args = ["ls", "/tmp/bind/config.json"]' config.json) + echo "${CONFIG}" >config.json + + runc run test_bind_mount + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ '/tmp/bind/config.json' ]] +} diff --git a/tests/integration/multi-arch.bash b/tests/integration/multi-arch.bash new file mode 100644 index 0000000..5616bf7 --- /dev/null +++ b/tests/integration/multi-arch.bash @@ -0,0 +1,22 @@ +#!/bin/bash +get_busybox(){ + case $(go env GOARCH) in + arm64) + echo 'https://github.com/docker-library/busybox/raw/dist-arm64v8/glibc/busybox.tar.xz' + ;; + *) + echo 'https://github.com/docker-library/busybox/raw/dist-amd64/glibc/busybox.tar.xz' + ;; + esac +} + +get_hello(){ + case $(go env GOARCH) in + arm64) + echo 'hello-world-aarch64.tar' + ;; + *) + echo 'hello-world.tar' + ;; + esac +} diff --git a/tests/integration/pause.bats b/tests/integration/pause.bats new file mode 100644 index 0000000..4e25e59 --- /dev/null +++ b/tests/integration/pause.bats @@ -0,0 +1,72 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc pause and resume" { + # XXX: currently cgroups require root containers. + requires root + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is paused + testcontainer test_busybox paused + + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is back to running + testcontainer test_busybox running +} + +@test "runc pause and resume with nonexist container" { + # XXX: currently cgroups require root containers. + requires root + + # run test_busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # pause test_busybox and nonexistent container + runc pause test_busybox + [ "$status" -eq 0 ] + runc pause nonexistent + [ "$status" -ne 0 ] + + # test state of test_busybox is paused + testcontainer test_busybox paused + + # resume test_busybox and nonexistent container + runc resume test_busybox + [ "$status" -eq 0 ] + runc resume nonexistent + [ "$status" -ne 0 ] + + # test state of test_busybox is back to running + testcontainer test_busybox running + + # delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} diff --git a/tests/integration/ps.bats b/tests/integration/ps.bats new file mode 100644 index 0000000..646b5ab --- /dev/null +++ b/tests/integration/ps.bats @@ -0,0 +1,62 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "ps" { + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]] + [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]] +} + +@test "ps -f json" { + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps -f json test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] +} + +@test "ps -e -x" { + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox -e -x + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ \ +PID\ +TTY\ +STAT\ +TIME\ +COMMAND+ ]] + [[ "${lines[1]}" =~ [0-9]+ ]] +} diff --git a/tests/integration/root.bats b/tests/integration/root.bats new file mode 100644 index 0000000..90b53b4 --- /dev/null +++ b/tests/integration/root.bats @@ -0,0 +1,50 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_running_container_inroot test_dotbox $HELLO_BUNDLE + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_running_container_inroot test_dotbox $HELLO_BUNDLE + teardown_busybox +} + +@test "global --root" { + # run busybox detached using $HELLO_BUNDLE for state + ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_dotbox + [ "$status" -eq 0 ] + + # run busybox detached in default root + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] + + ROOT=$HELLO_BUNDLE runc state test_dotbox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] + + ROOT=$HELLO_BUNDLE runc state test_busybox + [ "$status" -ne 0 ] + + runc state test_dotbox + [ "$status" -ne 0 ] + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + runc delete test_busybox + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc kill test_dotbox KILL + [ "$status" -eq 0 ] + retry 10 1 eval "ROOT='$HELLO_BUNDLE' __runc state test_dotbox | grep -q 'stopped'" + ROOT=$HELLO_BUNDLE runc delete test_dotbox + [ "$status" -eq 0 ] +} diff --git a/tests/integration/spec.bats b/tests/integration/spec.bats new file mode 100644 index 0000000..5df8f70 --- /dev/null +++ b/tests/integration/spec.bats @@ -0,0 +1,96 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + # initial cleanup in case a prior test exited and did not cleanup + cd "$INTEGRATION_ROOT" + run rm -f -r "$HELLO_BUNDLE" + + # setup hello-world for spec generation testing + run mkdir "$HELLO_BUNDLE" + run mkdir "$HELLO_BUNDLE"/rootfs + run tar -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" +} + +function teardown() { + cd "$INTEGRATION_ROOT" + run rm -f -r "$HELLO_BUNDLE" +} + +@test "spec generation cwd" { + cd "$HELLO_BUNDLE" + # note this test runs from the bundle not the integration root + + # test that config.json does not exist after the above partial setup + [ ! -e config.json ] + + # test generation of spec does not return an error + runc_spec + [ "$status" -eq 0 ] + + # test generation of spec created our config.json (spec) + [ -e config.json ] + + # test existence of required args parameter in the generated config.json + run bash -c "grep -A2 'args' config.json | grep 'sh'" + [[ "${output}" == *"sh"* ]] + + # change the default args parameter from sh to hello + sed -i 's;"sh";"/hello";' config.json + + # ensure the generated spec works by running hello-world + runc run test_hello + [ "$status" -eq 0 ] +} + +@test "spec generation --bundle" { + # note this test runs from the integration root not the bundle + + # test that config.json does not exist after the above partial setup + [ ! -e "$HELLO_BUNDLE"/config.json ] + + # test generation of spec does not return an error + runc_spec "$HELLO_BUNDLE" + [ "$status" -eq 0 ] + + # test generation of spec created our config.json (spec) + [ -e "$HELLO_BUNDLE"/config.json ] + + # change the default args parameter from sh to hello + sed -i 's;"sh";"/hello";' "$HELLO_BUNDLE"/config.json + + # ensure the generated spec works by running hello-world + runc run --bundle "$HELLO_BUNDLE" test_hello + [ "$status" -eq 0 ] +} + +@test "spec validator" { + TESTDIR=$(pwd) + cd "$HELLO_BUNDLE" + + run git clone https://github.com/opencontainers/runtime-spec.git src/runtime-spec + [ "$status" -eq 0 ] + + SPEC_COMMIT=$(grep '^github.com/opencontainers/runtime-spec' ${TESTDIR}/../../vendor.conf | tr -s ' ' | cut -d ' ' -f 2) + run git -C src/runtime-spec reset --hard "${SPEC_COMMIT}" + + [ "$status" -eq 0 ] + [ -e src/runtime-spec/schema/config-schema.json ] + + run bash -c "GOPATH='$GOPATH' go get github.com/xeipuuv/gojsonschema" + [ "$status" -eq 0 ] + + run git -C "${GOPATH}/src/github.com/xeipuuv/gojsonschema" reset --hard 6637feb73ee44cd4640bb3def285c29774234c7f + [ "$status" -eq 0 ] + + GOPATH="$GOPATH" go build src/runtime-spec/schema/validate.go + [ -e ./validate ] + + runc spec + [ -e config.json ] + + run ./validate src/runtime-spec/schema/config-schema.json config.json + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *"The document is valid"* ]] +} diff --git a/tests/integration/start.bats b/tests/integration/start.bats new file mode 100644 index 0000000..1f0ea8e --- /dev/null +++ b/tests/integration/start.bats @@ -0,0 +1,31 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc start" { + runc create --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start container test_busybox + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} diff --git a/tests/integration/start_detached.bats b/tests/integration/start_detached.bats new file mode 100644 index 0000000..7f177b8 --- /dev/null +++ b/tests/integration/start_detached.bats @@ -0,0 +1,76 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run detached" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running +} + +@test "runc run detached ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + sed -i 's;"uid": 0;"uid": 1000;g' config.json + sed -i 's;"gid": 0;"gid": 100;g' config.json + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running +} + +@test "runc run detached --pid-file" { + # run busybox detached + runc run --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc run detached --pid-file with new CWD" { + # create pid_file directory as the CWD + run mkdir pid_file + [ "$status" -eq 0 ] + run cd pid_file + [ "$status" -eq 0 ] + + # run busybox detached + runc run --pid-file pid.txt -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] +} diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats new file mode 100644 index 0000000..a706be2 --- /dev/null +++ b/tests/integration/start_hello.bats @@ -0,0 +1,64 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_hello + setup_hello +} + +function teardown() { + teardown_hello +} + +@test "runc run" { + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + sed -i 's;"uid": 0;"uid": 1000;g' config.json + sed -i 's;"gid": 0;"gid": 100;g' config.json + + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run with rootfs set to ." { + cp config.json rootfs/. + rm config.json + cd rootfs + sed -i 's;"rootfs";".";' config.json + + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run --pid-file" { + # run hello-world + runc run --pid-file pid.txt test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + run cat pid.txt + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] +} diff --git a/tests/integration/state.bats b/tests/integration/state.bats new file mode 100644 index 0000000..68dae38 --- /dev/null +++ b/tests/integration/state.bats @@ -0,0 +1,66 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "state (kill + delete)" { + runc state test_busybox + [ "$status" -ne 0 ] + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + # wait for busybox to be in the destroyed state + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + # delete test_busybox + runc delete test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "state (pause + resume)" { + # XXX: pause and resume require cgroups. + requires root + + runc state test_busybox + [ "$status" -ne 0 ] + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is paused + testcontainer test_busybox paused + + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is back to running + testcontainer test_busybox running +} diff --git a/tests/integration/testdata/hello-world-aarch64.tar b/tests/integration/testdata/hello-world-aarch64.tar new file mode 100644 index 0000000..186c8ae Binary files /dev/null and b/tests/integration/testdata/hello-world-aarch64.tar differ diff --git a/tests/integration/testdata/hello-world.tar b/tests/integration/testdata/hello-world.tar new file mode 100644 index 0000000..aec830e Binary files /dev/null and b/tests/integration/testdata/hello-world.tar differ diff --git a/tests/integration/tty.bats b/tests/integration/tty.bats new file mode 100644 index 0000000..688875d --- /dev/null +++ b/tests/integration/tty.bats @@ -0,0 +1,230 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run [tty ptsname]" { + # Replace sh script with readlink. + sed -i 's|"sh"|"sh", "-c", "for file in /proc/self/fd/[012]; do readlink $file; done"|' config.json + + # run busybox + runc run test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ /dev/pts/+ ]] + [[ ${lines[1]} =~ /dev/pts/+ ]] + [[ ${lines[2]} =~ /dev/pts/+ ]] +} + +@test "runc run [tty owner]" { + # tty chmod is not doable in rootless containers without idmap. + # TODO: this can be made as a change to the gid test. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # Replace sh script with stat. + sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json + + # run busybox + runc run test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ 0 ]] + # This is set by the default config.json (it corresponds to the standard tty group). + [[ ${lines[1]} =~ 5 ]] +} + +@test "runc run [tty owner] ({u,g}id != 0)" { + # tty chmod is not doable in rootless containers without idmap. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + sed -i 's;"uid": 0;"uid": 1000;g' config.json + sed -i 's;"gid": 0;"gid": 100;g' config.json + + # Replace sh script with stat. + sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json + + # run busybox + runc run test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ 1000 ]] + # This is set by the default config.json (it corresponds to the standard tty group). + [[ ${lines[1]} =~ 5 ]] +} + +@test "runc exec [tty ptsname]" { + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # make sure we're running + testcontainer test_busybox running + + # run the exec + runc exec test_busybox sh -c 'for file in /proc/self/fd/[012]; do readlink $file; done' + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ /dev/pts/+ ]] + [[ ${lines[1]} =~ /dev/pts/+ ]] + [[ ${lines[2]} =~ /dev/pts/+ ]] +} + +@test "runc exec [tty owner]" { + # tty chmod is not doable in rootless containers without idmap. + # TODO: this can be made as a change to the gid test. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # make sure we're running + testcontainer test_busybox running + + # run the exec + runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n' + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ 0 ]] + [[ ${lines[1]} =~ 5 ]] +} + +@test "runc exec [tty owner] ({u,g}id != 0)" { + # tty chmod is not doable in rootless containers without idmap. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + sed -i 's;"uid": 0;"uid": 1000;g' config.json + sed -i 's;"gid": 0;"gid": 100;g' config.json + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # make sure we're running + testcontainer test_busybox running + + # run the exec + runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n' + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ 1000 ]] + [[ ${lines[1]} =~ 5 ]] +} + +@test "runc exec [tty consolesize]" { + # allow writing to filesystem + sed -i 's/"readonly": true/"readonly": false/' config.json + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # make sure we're running + testcontainer test_busybox running + + tty_info_with_consize_size=$( cat < /tmp/tty-info" + ], + "cwd": "/" +} +EOF + ) + + # run the exec + runc exec --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET -p <( echo $tty_info_with_consize_size ) test_busybox + [ "$status" -eq 0 ] + + # check the pid was generated + [ -e pid.txt ] + + #wait user process to finish + timeout 1 tail --pid=$(head -n 1 pid.txt) -f /dev/null + + tty_info=$( cat </ { print $5; exit }') + eval CGROUP_${g}="${base_path}${CGROUPS_PATH}" + done + + CGROUP_SYSTEM_MEMORY=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'MEMORY'\>/ { print $5; exit }') + + # check that initial values were properly set + check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000 + check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000 + check_cgroup_value $CGROUP_CPU "cpu.shares" 100 + check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0 + check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216 + check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336 + check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432 + check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824 + check_cgroup_value $CGROUP_PIDS "pids.max" 20 + + # update cpu-period + runc update test_update --cpu-period 900000 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000 + + # update cpu-quota + runc update test_update --cpu-quota 600000 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000 + + # update cpu-shares + runc update test_update --cpu-share 200 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_CPU "cpu.shares" 200 + + # update cpuset if supported (i.e. we're running on a multicore cpu) + cpu_count=$(grep '^processor' /proc/cpuinfo | wc -l) + if [ $cpu_count -gt 1 ]; then + runc update test_update --cpuset-cpus "1" + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 1 + fi + + # update memory limit + runc update test_update --memory 67108864 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864 + + runc update test_update --memory 50M + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 52428800 + + # update memory soft limit + runc update test_update --memory-reservation 33554432 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432 + + # Run swap memory tests if swap is available + if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then + # try to remove memory swap limit + runc update test_update --memory-swap -1 + [ "$status" -eq 0 ] + # Get System memory swap limit + SYSTEM_MEMORY_SW=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.memsw.limit_in_bytes") + check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY_SW} + + # update memory swap + runc update test_update --memory-swap 96468992 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" 96468992 + fi; + + # try to remove memory limit + runc update test_update --memory -1 + [ "$status" -eq 0 ] + + # Get System memory limit + SYSTEM_MEMORY=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.limit_in_bytes") + # check memory limited is gone + check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" ${SYSTEM_MEMORY} + + # check swap memory limited is gone + if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then + check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY} + fi + + # update kernel memory limit + runc update test_update --kernel-memory 50331648 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 + + # update kernel memory tcp limit + runc update test_update --kernel-memory-tcp 41943040 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040 + + # update pids limit + runc update test_update --pids-limit 10 + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_PIDS "pids.max" 10 + + # Revert to the test initial value via json on stding + runc update -r - test_update < $BATS_TMPDIR/runc-cgroups-integration-test.json + + runc update -r $BATS_TMPDIR/runc-cgroups-integration-test.json test_update + [ "$status" -eq 0 ] + check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000 + check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000 + check_cgroup_value $CGROUP_CPU "cpu.shares" 100 + check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0 + check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216 + check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336 + check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432 + check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824 + check_cgroup_value $CGROUP_PIDS "pids.max" 20 +} + +@test "update rt period and runtime" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_kmem cgroups_rt + + # run a detached busybox + runc run -d --console-socket $CONSOLE_SOCKET test_update_rt + [ "$status" -eq 0 ] + + # get the cgroup paths + eval CGROUP_CPU="${CGROUP_CPU_BASE_PATH}${CGROUPS_PATH}" + + runc update -r - test_update_rt < /etc/subuid.tmp + mv /etc/subuid{.tmp,} + [ -e /etc/subgid.tmp ] && mv /etc/subgid{.tmp,} + ( grep -v '^rootless' /etc/subgid ; echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" ) > /etc/subgid.tmp + mv /etc/subgid{.tmp,} + + # Reactivate new{uid,gid}map helpers if applicable. + [ -e /usr/bin/unused-newuidmap ] && mv /usr/bin/{unused-,}newuidmap + [ -e /usr/bin/unused-newgidmap ] && mv /usr/bin/{unused-,}newgidmap +} + +function disable_idmap() { + export ROOTLESS_UIDMAP_START ROOTLESS_UIDMAP_LENGTH + export ROOTLESS_GIDMAP_START ROOTLESS_GIDMAP_LENGTH + + # Deactivate sub{uid,gid} mappings. + [ -e /etc/subuid ] && mv /etc/subuid{,.tmp} + [ -e /etc/subgid ] && mv /etc/subgid{,.tmp} + + # Deactivate new{uid,gid}map helpers. setuid is preserved with mv(1). + [ -e /usr/bin/newuidmap ] && mv /usr/bin/{,unused-}newuidmap + [ -e /usr/bin/newgidmap ] && mv /usr/bin/{,unused-}newgidmap +} + +# FEATURE: Opportunistic cgroups support, allowing a rootless container to set +# resource limits on condition that cgroupsPath is set to a path the +# rootless user has permissions on. + +# List of cgroups. We handle name= cgroups as well as combined +# (comma-separated) cgroups and correctly split and/or strip them. +ALL_CGROUPS=( $(cat /proc/self/cgroup | cut -d: -f2 | sed -E '{s/^name=//;s/,/\n/;/^$/D}') ) +CGROUP_MOUNT="/sys/fs/cgroup" +CGROUP_PATH="/runc-cgroups-integration-test" + +function enable_cgroup() { + # Set up cgroups for use in rootless containers. + for cg in "${ALL_CGROUPS[@]}" + do + mkdir -p "$CGROUP_MOUNT/$cg$CGROUP_PATH" + # We only need to allow write access to {cgroup.procs,tasks} and the + # directory. Rather than changing the owner entirely, we just change + # the group and then allow write access to the group (in order to + # further limit the possible DAC permissions that runc could use). + chown root:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks} + chmod g+rwx "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks} + # Due to cpuset's semantics we need to give extra permissions to allow + # for runc to set up the hierarchy. XXX: This really shouldn't be + # necessary, and might actually be a bug in our impl of cgroup + # handling. + [[ "$cg" == "cpuset" ]] && chown rootless:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/cpuset."{cpus,mems} + done +} + +function disable_cgroup() { + # Remove cgroups used in rootless containers. + for cg in "${ALL_CGROUPS[@]}" + do + [ -d "$CGROUP_MOUNT/$cg$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$cg$CGROUP_PATH" + done +} + +# Create a powerset of $ALL_FEATURES (the set of all subsets of $ALL_FEATURES). +# We test all of the possible combinations (as long as we don't add too many +# feature knobs this shouldn't take too long -- but the number of tested +# combinations is O(2^n)). +function powerset() { + eval printf '%s' $(printf '{,%s+}' "$@"): +} +features_powerset="$(powerset "${ALL_FEATURES[@]}")" + +# Iterate over the powerset of all features. +IFS=: +for enabled_features in $features_powerset +do + idx="$(($idx+1))" + echo "[$(printf '%.2d' "$idx")] run rootless tests ... (${enabled_features%%+})" + + unset IFS + for feature in "${ALL_FEATURES[@]}" + do + hook_func="disable_$feature" + grep -E "(^|\+)$feature(\+|$)" <<<$enabled_features &>/dev/null && hook_func="enable_$feature" + "$hook_func" + done + + # Run the test suite! + set -e + echo path: $PATH + export ROOTLESS_FEATURES="$enabled_features" + sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$TESTFLAGS" + set +e +done diff --git a/tty.go b/tty.go new file mode 100644 index 0000000..6106c2d --- /dev/null +++ b/tty.go @@ -0,0 +1,170 @@ +// +build linux + +package main + +import ( + "fmt" + "io" + "os" + "os/signal" + "sync" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" +) + +type tty struct { + epoller *console.Epoller + console *console.EpollConsole + stdin console.Console + closers []io.Closer + postStart []io.Closer + wg sync.WaitGroup + consoleC chan error +} + +func (t *tty) copyIO(w io.Writer, r io.ReadCloser) { + defer t.wg.Done() + io.Copy(w, r) + r.Close() +} + +// setup pipes for the process so that advanced features like c/r are able to easily checkpoint +// and restore the process's IO without depending on a host specific path or device +func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, error) { + i, err := p.InitializeIO(rootuid, rootgid) + if err != nil { + return nil, err + } + t := &tty{ + closers: []io.Closer{ + i.Stdin, + i.Stdout, + i.Stderr, + }, + } + // add the process's io to the post start closers if they support close + for _, cc := range []interface{}{ + p.Stdin, + p.Stdout, + p.Stderr, + } { + if c, ok := cc.(io.Closer); ok { + t.postStart = append(t.postStart, c) + } + } + go func() { + io.Copy(i.Stdin, os.Stdin) + i.Stdin.Close() + }() + t.wg.Add(2) + go t.copyIO(os.Stdout, i.Stdout) + go t.copyIO(os.Stderr, i.Stderr) + return t, nil +} + +func inheritStdio(process *libcontainer.Process) error { + process.Stdin = os.Stdin + process.Stdout = os.Stdout + process.Stderr = os.Stderr + return nil +} + +func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error) { + f, err := utils.RecvFd(socket) + if err != nil { + return err + } + cons, err := console.ConsoleFromFile(f) + if err != nil { + return err + } + console.ClearONLCR(cons.Fd()) + epoller, err := console.NewEpoller() + if err != nil { + return err + } + epollConsole, err := epoller.Add(cons) + if err != nil { + return err + } + defer func() { + if Err != nil { + epollConsole.Close() + } + }() + go epoller.Wait() + go io.Copy(epollConsole, os.Stdin) + t.wg.Add(1) + go t.copyIO(os.Stdout, epollConsole) + + // set raw mode to stdin and also handle interrupt + stdin, err := console.ConsoleFromFile(os.Stdin) + if err != nil { + return err + } + if err := stdin.SetRaw(); err != nil { + return fmt.Errorf("failed to set the terminal from the stdin: %v", err) + } + go handleInterrupt(stdin) + + t.epoller = epoller + t.stdin = stdin + t.console = epollConsole + t.closers = []io.Closer{epollConsole} + return nil +} + +func handleInterrupt(c console.Console) { + sigchan := make(chan os.Signal, 1) + signal.Notify(sigchan, os.Interrupt) + <-sigchan + c.Reset() + os.Exit(0) +} + +func (t *tty) waitConsole() error { + if t.consoleC != nil { + return <-t.consoleC + } + return nil +} + +// ClosePostStart closes any fds that are provided to the container and dup2'd +// so that we no longer have copy in our process. +func (t *tty) ClosePostStart() error { + for _, c := range t.postStart { + c.Close() + } + return nil +} + +// Close closes all open fds for the tty and/or restores the original +// stdin state to what it was prior to the container execution +func (t *tty) Close() error { + // ensure that our side of the fds are always closed + for _, c := range t.postStart { + c.Close() + } + // the process is gone at this point, shutting down the console if we have + // one and wait for all IO to be finished + if t.console != nil && t.epoller != nil { + t.console.Shutdown(t.epoller.CloseConsole) + } + t.wg.Wait() + for _, c := range t.closers { + c.Close() + } + if t.stdin != nil { + t.stdin.Reset() + } + return nil +} + +func (t *tty) resize() error { + if t.console == nil { + return nil + } + return t.console.ResizeFrom(console.Current()) +} diff --git a/types/events.go b/types/events.go new file mode 100644 index 0000000..c6f0e97 --- /dev/null +++ b/types/events.go @@ -0,0 +1,130 @@ +package types + +// Event struct for encoding the event data to json. +type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// stats is the runc specific stats structure for stability when encoding and decoding stats. +type Stats struct { + CPU Cpu `json:"cpu"` + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` + Blkio Blkio `json:"blkio"` + Hugetlb map[string]Hugetlb `json:"hugetlb"` + IntelRdt IntelRdt `json:"intel_rdt"` + NetworkInterfaces []*NetworkInterface `json:"network_interfaces"` +} + +type Hugetlb struct { + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +type BlkioEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type Blkio struct { + IoServiceBytesRecursive []BlkioEntry `json:"ioServiceBytesRecursive,omitempty"` + IoServicedRecursive []BlkioEntry `json:"ioServicedRecursive,omitempty"` + IoQueuedRecursive []BlkioEntry `json:"ioQueueRecursive,omitempty"` + IoServiceTimeRecursive []BlkioEntry `json:"ioServiceTimeRecursive,omitempty"` + IoWaitTimeRecursive []BlkioEntry `json:"ioWaitTimeRecursive,omitempty"` + IoMergedRecursive []BlkioEntry `json:"ioMergedRecursive,omitempty"` + IoTimeRecursive []BlkioEntry `json:"ioTimeRecursive,omitempty"` + SectorsRecursive []BlkioEntry `json:"sectorsRecursive,omitempty"` +} + +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +type Throttling struct { + Periods uint64 `json:"periods,omitempty"` + ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"` + ThrottledTime uint64 `json:"throttledTime,omitempty"` +} + +type CpuUsage struct { + // Units: nanoseconds. + Total uint64 `json:"total,omitempty"` + Percpu []uint64 `json:"percpu,omitempty"` + Kernel uint64 `json:"kernel"` + User uint64 `json:"user"` +} + +type Cpu struct { + Usage CpuUsage `json:"usage,omitempty"` + Throttling Throttling `json:"throttling,omitempty"` +} + +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type IntelRdt struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` +} + +type NetworkInterface struct { + // Name is the name of the network interface. + Name string + + RxBytes uint64 + RxPackets uint64 + RxErrors uint64 + RxDropped uint64 + TxBytes uint64 + TxPackets uint64 + TxErrors uint64 + TxDropped uint64 +} diff --git a/update.go b/update.go new file mode 100644 index 0000000..05dc4b5 --- /dev/null +++ b/update.go @@ -0,0 +1,304 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "os" + "strconv" + + "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +func i64Ptr(i int64) *int64 { return &i } +func u64Ptr(i uint64) *uint64 { return &i } +func u16Ptr(i uint16) *uint16 { return &i } + +var updateCommand = cli.Command{ + Name: "update", + Usage: "update container resource constraints", + ArgsUsage: ``, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "resources, r", + Value: "", + Usage: `path to the file containing the resources to update or '-' to read from the standard input + +The accepted format is as follow (unchanged values can be omitted): + +{ + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0, + "kernel": 0, + "kernelTCP": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "weight": 0 + } +} + +Note: if data is to be read from a file or the standard input, all +other options are ignored. +`, + }, + + cli.IntFlag{ + Name: "blkio-weight", + Usage: "Specifies per cgroup weight, range is from 10 to 1000", + }, + cli.StringFlag{ + Name: "cpu-period", + Usage: "CPU CFS period to be used for hardcapping (in usecs). 0 to use system default", + }, + cli.StringFlag{ + Name: "cpu-quota", + Usage: "CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period", + }, + cli.StringFlag{ + Name: "cpu-share", + Usage: "CPU shares (relative weight vs. other containers)", + }, + cli.StringFlag{ + Name: "cpu-rt-period", + Usage: "CPU realtime period to be used for hardcapping (in usecs). 0 to use system default", + }, + cli.StringFlag{ + Name: "cpu-rt-runtime", + Usage: "CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period", + }, + cli.StringFlag{ + Name: "cpuset-cpus", + Usage: "CPU(s) to use", + }, + cli.StringFlag{ + Name: "cpuset-mems", + Usage: "Memory node(s) to use", + }, + cli.StringFlag{ + Name: "kernel-memory", + Usage: "Kernel memory limit (in bytes)", + }, + cli.StringFlag{ + Name: "kernel-memory-tcp", + Usage: "Kernel memory limit (in bytes) for tcp buffer", + }, + cli.StringFlag{ + Name: "memory", + Usage: "Memory limit (in bytes)", + }, + cli.StringFlag{ + Name: "memory-reservation", + Usage: "Memory reservation or soft_limit (in bytes)", + }, + cli.StringFlag{ + Name: "memory-swap", + Usage: "Total memory usage (memory + swap); set '-1' to enable unlimited swap", + }, + cli.IntFlag{ + Name: "pids-limit", + Usage: "Maximum number of pids allowed in the container", + }, + cli.StringFlag{ + Name: "l3-cache-schema", + Usage: "The string of Intel RDT/CAT L3 cache schema", + }, + cli.StringFlag{ + Name: "mem-bw-schema", + Usage: "The string of Intel RDT/MBA memory bandwidth schema", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + + r := specs.LinuxResources{ + Memory: &specs.LinuxMemory{ + Limit: i64Ptr(0), + Reservation: i64Ptr(0), + Swap: i64Ptr(0), + Kernel: i64Ptr(0), + KernelTCP: i64Ptr(0), + }, + CPU: &specs.LinuxCPU{ + Shares: u64Ptr(0), + Quota: i64Ptr(0), + Period: u64Ptr(0), + RealtimeRuntime: i64Ptr(0), + RealtimePeriod: u64Ptr(0), + Cpus: "", + Mems: "", + }, + BlockIO: &specs.LinuxBlockIO{ + Weight: u16Ptr(0), + }, + Pids: &specs.LinuxPids{ + Limit: 0, + }, + } + + config := container.Config() + + if in := context.String("resources"); in != "" { + var ( + f *os.File + err error + ) + switch in { + case "-": + f = os.Stdin + default: + f, err = os.Open(in) + if err != nil { + return err + } + } + err = json.NewDecoder(f).Decode(&r) + if err != nil { + return err + } + } else { + if val := context.Int("blkio-weight"); val != 0 { + r.BlockIO.Weight = u16Ptr(uint16(val)) + } + if val := context.String("cpuset-cpus"); val != "" { + r.CPU.Cpus = val + } + if val := context.String("cpuset-mems"); val != "" { + r.CPU.Mems = val + } + + for _, pair := range []struct { + opt string + dest *uint64 + }{ + + {"cpu-period", r.CPU.Period}, + {"cpu-rt-period", r.CPU.RealtimePeriod}, + {"cpu-share", r.CPU.Shares}, + } { + if val := context.String(pair.opt); val != "" { + var err error + *pair.dest, err = strconv.ParseUint(val, 10, 64) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } + } + for _, pair := range []struct { + opt string + dest *int64 + }{ + + {"cpu-quota", r.CPU.Quota}, + {"cpu-rt-runtime", r.CPU.RealtimeRuntime}, + } { + if val := context.String(pair.opt); val != "" { + var err error + *pair.dest, err = strconv.ParseInt(val, 10, 64) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } + } + for _, pair := range []struct { + opt string + dest *int64 + }{ + {"memory", r.Memory.Limit}, + {"memory-swap", r.Memory.Swap}, + {"kernel-memory", r.Memory.Kernel}, + {"kernel-memory-tcp", r.Memory.KernelTCP}, + {"memory-reservation", r.Memory.Reservation}, + } { + if val := context.String(pair.opt); val != "" { + var v int64 + + if val != "-1" { + v, err = units.RAMInBytes(val) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } else { + v = -1 + } + *pair.dest = v + } + } + r.Pids.Limit = int64(context.Int("pids-limit")) + } + + // Update the value + config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight + config.Cgroups.Resources.CpuPeriod = *r.CPU.Period + config.Cgroups.Resources.CpuQuota = *r.CPU.Quota + config.Cgroups.Resources.CpuShares = *r.CPU.Shares + config.Cgroups.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus + config.Cgroups.Resources.CpusetMems = r.CPU.Mems + config.Cgroups.Resources.KernelMemory = *r.Memory.Kernel + config.Cgroups.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + config.Cgroups.Resources.Memory = *r.Memory.Limit + config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation + config.Cgroups.Resources.MemorySwap = *r.Memory.Swap + config.Cgroups.Resources.PidsLimit = r.Pids.Limit + + // Update Intel RDT + l3CacheSchema := context.String("l3-cache-schema") + memBwSchema := context.String("mem-bw-schema") + if l3CacheSchema != "" && !intelrdt.IsCatEnabled() { + return fmt.Errorf("Intel RDT/CAT: l3 cache schema is not enabled") + } + + if memBwSchema != "" && !intelrdt.IsMbaEnabled() { + return fmt.Errorf("Intel RDT/MBA: memory bandwidth schema is not enabled") + } + + if l3CacheSchema != "" || memBwSchema != "" { + // If intelRdt is not specified in original configuration, we just don't + // Apply() to create intelRdt group or attach tasks for this container. + // In update command, we could re-enable through IntelRdtManager.Apply() + // and then update intelrdt constraint. + if config.IntelRdt == nil { + state, err := container.State() + if err != nil { + return err + } + config.IntelRdt = &configs.IntelRdt{} + intelRdtManager := intelrdt.IntelRdtManager{ + Config: &config, + Id: container.ID(), + Path: state.IntelRdtPath, + } + if err := intelRdtManager.Apply(state.InitProcessPid); err != nil { + return err + } + } + config.IntelRdt.L3CacheSchema = l3CacheSchema + config.IntelRdt.MemBwSchema = memBwSchema + } + + return container.Set(config) + }, +} diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..5165336 --- /dev/null +++ b/utils.go @@ -0,0 +1,94 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +const ( + exactArgs = iota + minArgs + maxArgs +) + +func checkArgs(context *cli.Context, expected, checkType int) error { + var err error + cmdName := context.Command.Name + switch checkType { + case exactArgs: + if context.NArg() != expected { + err = fmt.Errorf("%s: %q requires exactly %d argument(s)", os.Args[0], cmdName, expected) + } + case minArgs: + if context.NArg() < expected { + err = fmt.Errorf("%s: %q requires a minimum of %d argument(s)", os.Args[0], cmdName, expected) + } + case maxArgs: + if context.NArg() > expected { + err = fmt.Errorf("%s: %q requires a maximum of %d argument(s)", os.Args[0], cmdName, expected) + } + } + + if err != nil { + fmt.Printf("Incorrect Usage.\n\n") + cli.ShowCommandHelp(context, cmdName) + return err + } + return nil +} + +// fatal prints the error's details if it is a libcontainer specific error type +// then exits the program with an exit status of 1. +func fatal(err error) { + // make sure the error is written to the logger + logrus.Error(err) + fmt.Fprintln(os.Stderr, err) + os.Exit(1) +} + +// setupSpec performs initial setup based on the cli.Context for the container +func setupSpec(context *cli.Context) (*specs.Spec, error) { + bundle := context.String("bundle") + if bundle != "" { + if err := os.Chdir(bundle); err != nil { + return nil, err + } + } + spec, err := loadSpec(specConfig) + if err != nil { + return nil, err + } + return spec, nil +} + +func revisePidFile(context *cli.Context) error { + pidFile := context.String("pid-file") + if pidFile == "" { + return nil + } + + // convert pid-file to an absolute path so we can write to the right + // file after chdir to bundle + pidFile, err := filepath.Abs(pidFile) + if err != nil { + return err + } + return context.Set("pid-file", pidFile) +} + +// parseBoolOrAuto returns (nil, nil) if s is empty or "auto" +func parseBoolOrAuto(s string) (*bool, error) { + if s == "" || strings.ToLower(s) == "auto" { + return nil, nil + } + b, err := strconv.ParseBool(s) + return &b, err +} diff --git a/utils_linux.go b/utils_linux.go new file mode 100644 index 0000000..984e6b0 --- /dev/null +++ b/utils_linux.go @@ -0,0 +1,453 @@ +// +build linux + +package main + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + selinux "github.com/opencontainers/selinux/go-selinux" + + "github.com/coreos/go-systemd/activation" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +var errEmptyID = errors.New("container id cannot be empty") + +// loadFactory returns the configured factory instance for execing containers. +func loadFactory(context *cli.Context) (libcontainer.Factory, error) { + root := context.GlobalString("root") + abs, err := filepath.Abs(root) + if err != nil { + return nil, err + } + + // We default to cgroupfs, and can only use systemd if the system is a + // systemd box. + cgroupManager := libcontainer.Cgroupfs + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return nil, err + } + if rootlessCg { + cgroupManager = libcontainer.RootlessCgroupfs + } + if context.GlobalBool("systemd-cgroup") { + if systemd.UseSystemd() { + cgroupManager = libcontainer.SystemdCgroups + } else { + return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available") + } + } + + intelRdtManager := libcontainer.IntelRdtFs + if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { + intelRdtManager = nil + } + + // We resolve the paths for {newuidmap,newgidmap} from the context of runc, + // to avoid doing a path lookup in the nsexec context. TODO: The binary + // names are not currently configurable. + newuidmap, err := exec.LookPath("newuidmap") + if err != nil { + newuidmap = "" + } + newgidmap, err := exec.LookPath("newgidmap") + if err != nil { + newgidmap = "" + } + + return libcontainer.New(abs, cgroupManager, intelRdtManager, + libcontainer.CriuPath(context.GlobalString("criu")), + libcontainer.NewuidmapPath(newuidmap), + libcontainer.NewgidmapPath(newgidmap)) +} + +// getContainer returns the specified container instance by loading it from state +// with the default factory. +func getContainer(context *cli.Context) (libcontainer.Container, error) { + id := context.Args().First() + if id == "" { + return nil, errEmptyID + } + factory, err := loadFactory(context) + if err != nil { + return nil, err + } + return factory.Load(id) +} + +func fatalf(t string, v ...interface{}) { + fatal(fmt.Errorf(t, v...)) +} + +func getDefaultImagePath(context *cli.Context) string { + cwd, err := os.Getwd() + if err != nil { + panic(err) + } + return filepath.Join(cwd, "checkpoint") +} + +// newProcess returns a new libcontainer Process with the arguments from the +// spec and stdio from the current process. +func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) { + lp := &libcontainer.Process{ + Args: p.Args, + Env: p.Env, + // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. + User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID), + Cwd: p.Cwd, + Label: p.SelinuxLabel, + NoNewPrivileges: &p.NoNewPrivileges, + AppArmorProfile: p.ApparmorProfile, + Init: init, + LogLevel: logLevel, + } + + if p.ConsoleSize != nil { + lp.ConsoleWidth = uint16(p.ConsoleSize.Width) + lp.ConsoleHeight = uint16(p.ConsoleSize.Height) + } + + if p.Capabilities != nil { + lp.Capabilities = &configs.Capabilities{} + lp.Capabilities.Bounding = p.Capabilities.Bounding + lp.Capabilities.Effective = p.Capabilities.Effective + lp.Capabilities.Inheritable = p.Capabilities.Inheritable + lp.Capabilities.Permitted = p.Capabilities.Permitted + lp.Capabilities.Ambient = p.Capabilities.Ambient + } + for _, gid := range p.User.AdditionalGids { + lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10)) + } + for _, rlimit := range p.Rlimits { + rl, err := createLibContainerRlimit(rlimit) + if err != nil { + return nil, err + } + lp.Rlimits = append(lp.Rlimits, rl) + } + return lp, nil +} + +func destroy(container libcontainer.Container) { + if err := container.Destroy(); err != nil { + logrus.Error(err) + } +} + +// setupIO modifies the given process config according to the options. +func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) { + if createTTY { + process.Stdin = nil + process.Stdout = nil + process.Stderr = nil + t := &tty{} + if !detach { + parent, child, err := utils.NewSockPair("console") + if err != nil { + return nil, err + } + process.ConsoleSocket = child + t.postStart = append(t.postStart, parent, child) + t.consoleC = make(chan error, 1) + go func() { + if err := t.recvtty(process, parent); err != nil { + t.consoleC <- err + } + t.consoleC <- nil + }() + } else { + // the caller of runc will handle receiving the console master + conn, err := net.Dial("unix", sockpath) + if err != nil { + return nil, err + } + uc, ok := conn.(*net.UnixConn) + if !ok { + return nil, fmt.Errorf("casting to UnixConn failed") + } + t.postStart = append(t.postStart, uc) + socket, err := uc.File() + if err != nil { + return nil, err + } + t.postStart = append(t.postStart, socket) + process.ConsoleSocket = socket + } + return t, nil + } + // when runc will detach the caller provides the stdio to runc via runc's 0,1,2 + // and the container's process inherits runc's stdio. + if detach { + if err := inheritStdio(process); err != nil { + return nil, err + } + return &tty{}, nil + } + return setupProcessPipes(process, rootuid, rootgid) +} + +// createPidFile creates a file with the processes pid inside it atomically +// it creates a temp file with the paths filename + '.' infront of it +// then renames the file +func createPidFile(path string, process *libcontainer.Process) error { + pid, err := process.Pid() + if err != nil { + return err + } + var ( + tmpDir = filepath.Dir(path) + tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path))) + ) + f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666) + if err != nil { + return err + } + _, err = fmt.Fprintf(f, "%d", pid) + f.Close() + if err != nil { + return err + } + return os.Rename(tmpName, path) +} + +func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) { + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return nil, err + } + config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{ + CgroupName: id, + UseSystemdCgroup: context.GlobalBool("systemd-cgroup"), + NoPivotRoot: context.Bool("no-pivot"), + NoNewKeyring: context.Bool("no-new-keyring"), + Spec: spec, + RootlessEUID: os.Geteuid() != 0, + RootlessCgroups: rootlessCg, + }) + if err != nil { + return nil, err + } + + factory, err := loadFactory(context) + if err != nil { + return nil, err + } + return factory.Create(id, config) +} + +type runner struct { + init bool + enableSubreaper bool + shouldDestroy bool + detach bool + listenFDs []*os.File + preserveFDs int + pidFile string + consoleSocket string + container libcontainer.Container + action CtAct + notifySocket *notifySocket + criuOpts *libcontainer.CriuOpts + logLevel string +} + +func (r *runner) run(config *specs.Process) (int, error) { + var err error + defer func() { + if err != nil { + r.destroy() + } + }() + if err = r.checkTerminal(config); err != nil { + return -1, err + } + process, err := newProcess(*config, r.init, r.logLevel) + if err != nil { + return -1, err + } + if len(r.listenFDs) > 0 { + process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1") + process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) + } + baseFd := 3 + len(process.ExtraFiles) + for i := baseFd; i < baseFd+r.preserveFDs; i++ { + _, err = os.Stat(fmt.Sprintf("/proc/self/fd/%d", i)) + if err != nil { + return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs) + } + process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) + } + rootuid, err := r.container.Config().HostRootUID() + if err != nil { + return -1, err + } + rootgid, err := r.container.Config().HostRootGID() + if err != nil { + return -1, err + } + var ( + detach = r.detach || (r.action == CT_ACT_CREATE) + ) + // Setting up IO is a two stage process. We need to modify process to deal + // with detaching containers, and then we get a tty after the container has + // started. + handler := newSignalHandler(r.enableSubreaper, r.notifySocket) + tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket) + if err != nil { + return -1, err + } + defer tty.Close() + + switch r.action { + case CT_ACT_CREATE: + err = r.container.Start(process) + case CT_ACT_RESTORE: + err = r.container.Restore(process, r.criuOpts) + case CT_ACT_RUN: + err = r.container.Run(process) + default: + panic("Unknown action") + } + if err != nil { + return -1, err + } + if err = tty.waitConsole(); err != nil { + r.terminate(process) + return -1, err + } + if err = tty.ClosePostStart(); err != nil { + r.terminate(process) + return -1, err + } + if r.pidFile != "" { + if err = createPidFile(r.pidFile, process); err != nil { + r.terminate(process) + return -1, err + } + } + status, err := handler.forward(process, tty, detach) + if err != nil { + r.terminate(process) + } + if detach { + return 0, nil + } + r.destroy() + return status, err +} + +func (r *runner) destroy() { + if r.shouldDestroy { + destroy(r.container) + } +} + +func (r *runner) terminate(p *libcontainer.Process) { + _ = p.Signal(unix.SIGKILL) + _, _ = p.Wait() +} + +func (r *runner) checkTerminal(config *specs.Process) error { + detach := r.detach || (r.action == CT_ACT_CREATE) + // Check command-line for sanity. + if detach && config.Terminal && r.consoleSocket == "" { + return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket") + } + if (!detach || !config.Terminal) && r.consoleSocket != "" { + return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty") + } + return nil +} + +func validateProcessSpec(spec *specs.Process) error { + if spec.Cwd == "" { + return fmt.Errorf("Cwd property must not be empty") + } + if !filepath.IsAbs(spec.Cwd) { + return fmt.Errorf("Cwd must be an absolute path") + } + if len(spec.Args) == 0 { + return fmt.Errorf("args must not be empty") + } + if spec.SelinuxLabel != "" && !selinux.GetEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + return nil +} + +type CtAct uint8 + +const ( + CT_ACT_CREATE CtAct = iota + 1 + CT_ACT_RUN + CT_ACT_RESTORE +) + +func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) { + id := context.Args().First() + if id == "" { + return -1, errEmptyID + } + + notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id) + if notifySocket != nil { + notifySocket.setupSpec(context, spec) + } + + container, err := createContainer(context, id, spec) + if err != nil { + return -1, err + } + + if notifySocket != nil { + err := notifySocket.setupSocket() + if err != nil { + return -1, err + } + } + + // Support on-demand socket activation by passing file descriptors into the container init process. + listenFDs := []*os.File{} + if os.Getenv("LISTEN_FDS") != "" { + listenFDs = activation.Files(false) + } + + logLevel := "info" + if context.GlobalBool("debug") { + logLevel = "debug" + } + + r := &runner{ + enableSubreaper: !context.Bool("no-subreaper"), + shouldDestroy: true, + container: container, + listenFDs: listenFDs, + notifySocket: notifySocket, + consoleSocket: context.String("console-socket"), + detach: context.Bool("detach"), + pidFile: context.String("pid-file"), + preserveFDs: context.Int("preserve-fds"), + action: action, + criuOpts: criuOpts, + init: true, + logLevel: logLevel, + } + return r.run(spec.Process) +} diff --git a/vendor.conf b/vendor.conf new file mode 100644 index 0000000..dd51785 --- /dev/null +++ b/vendor.conf @@ -0,0 +1,31 @@ +# OCI runtime-spec. When updating this, make sure you use a version tag rather +# than a commit ID so it's much more obvious what version of the spec we are +# using. +github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4 # v1.0.1-59-g29686db + +# Core libcontainer functionality. +github.com/checkpoint-restore/go-criu 17b0214f6c48980c45dc47ecb0cfd6d9e02df723 # v3.11 +github.com/mrunalp/fileutils 7d4729fb36185a7c1719923406c9d40e54fb93c7 +github.com/opencontainers/selinux 5215b1806f52b1fcc2070a8826c542c9d33cd3cf # v1.3.0 (+ CVE-2019-16884) +github.com/seccomp/libseccomp-golang 689e3c1541a84461afc49c1c87352a6cedf72e9c # v0.9.1 +github.com/sirupsen/logrus 8bdbc7bcc01dcbb8ec23dc8a28e332258d25251f # v1.4.1 +github.com/syndtr/gocapability d98352740cb2c55f81556b63d4a1ec64c5a319c2 +github.com/vishvananda/netlink 1e2e08e8a2dcdacaae3f14ac44c5cfa31361f270 + +# systemd integration. +github.com/coreos/go-systemd 95778dfbb74eb7e4dbaf43bf7d71809650ef8076 # v19 +github.com/godbus/dbus 2ff6f7ffd60f0f2410b3105864bdd12c7894f844 # v5.0.1 +github.com/golang/protobuf 925541529c1fa6821df4e44ce2723319eb2be768 # v1.0.0 + +# Command-line interface. +github.com/cyphar/filepath-securejoin a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2 +github.com/docker/go-units 47565b4f722fb6ceae66b95f853feed578a4a51c # v0.3.3 +github.com/urfave/cli cfb38830724cc34fedffe9a2a29fb54fa9169cd1 # v1.20.0 +golang.org/x/sys 9eafafc0a87e0fd0aeeba439a4573537970c44c7 https://github.com/golang/sys + +# console dependencies +github.com/containerd/console 0650fd9eeb50bab4fc99dceb9f2e14cf58f36e7f +github.com/pkg/errors ba968bfe8b2f7e042a574c888954fccecfa385b4 # v0.8.1 + +# ebpf dependencies +github.com/cilium/ebpf 95b36a581eed7b0f127306ed1d16cc0ddc06cf67 diff --git a/vendor/github.com/cilium/ebpf/LICENSE b/vendor/github.com/cilium/ebpf/LICENSE new file mode 100644 index 0000000..c637ae9 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/cilium/ebpf/abi.go b/vendor/github.com/cilium/ebpf/abi.go new file mode 100644 index 0000000..999b8cc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/abi.go @@ -0,0 +1,183 @@ +package ebpf + +import ( + "github.com/pkg/errors" +) + +// CollectionABI describes the interface of an eBPF collection. +type CollectionABI struct { + Maps map[string]*MapABI + Programs map[string]*ProgramABI +} + +// CheckSpec verifies that all maps and programs mentioned +// in the ABI are present in the spec. +func (abi *CollectionABI) CheckSpec(cs *CollectionSpec) error { + for name := range abi.Maps { + if cs.Maps[name] == nil { + return errors.Errorf("missing map %s", name) + } + } + + for name := range abi.Programs { + if cs.Programs[name] == nil { + return errors.Errorf("missing program %s", name) + } + } + + return nil +} + +// Check verifies that all items in a collection conform to this ABI. +func (abi *CollectionABI) Check(coll *Collection) error { + for name, mapABI := range abi.Maps { + m := coll.Maps[name] + if m == nil { + return errors.Errorf("missing map %s", name) + } + if err := mapABI.Check(m); err != nil { + return errors.Wrapf(err, "map %s", name) + } + } + + for name, progABI := range abi.Programs { + p := coll.Programs[name] + if p == nil { + return errors.Errorf("missing program %s", name) + } + if err := progABI.Check(p); err != nil { + return errors.Wrapf(err, "program %s", name) + } + } + + return nil +} + +// MapABI describes a Map. +// +// Use it to assert that a Map matches what your code expects. +type MapABI struct { + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + InnerMap *MapABI +} + +func newMapABIFromSpec(spec *MapSpec) *MapABI { + var inner *MapABI + if spec.InnerMap != nil { + inner = newMapABIFromSpec(spec.InnerMap) + } + + return &MapABI{ + spec.Type, + spec.KeySize, + spec.ValueSize, + spec.MaxEntries, + inner, + } +} + +func newMapABIFromFd(fd *bpfFD) (*MapABI, error) { + info, err := bpfGetMapInfoByFD(fd) + if err != nil { + return nil, err + } + + mapType := MapType(info.mapType) + if mapType == ArrayOfMaps || mapType == HashOfMaps { + return nil, errors.New("can't get map info for nested maps") + } + + return &MapABI{ + mapType, + info.keySize, + info.valueSize, + info.maxEntries, + nil, + }, nil +} + +// Check verifies that a Map conforms to the ABI. +// +// Members of ABI which have the zero value of their type are not checked. +func (abi *MapABI) Check(m *Map) error { + return abi.check(&m.abi) +} + +func (abi *MapABI) check(other *MapABI) error { + if abi.Type != UnspecifiedMap && other.Type != abi.Type { + return errors.Errorf("expected map type %s, have %s", abi.Type, other.Type) + } + if err := checkUint32("key size", abi.KeySize, other.KeySize); err != nil { + return err + } + if err := checkUint32("value size", abi.ValueSize, other.ValueSize); err != nil { + return err + } + if err := checkUint32("max entries", abi.MaxEntries, other.MaxEntries); err != nil { + return err + } + + if abi.InnerMap == nil { + if abi.Type == ArrayOfMaps || abi.Type == HashOfMaps { + return errors.New("missing inner map ABI") + } + + return nil + } + + if other.InnerMap == nil { + return errors.New("missing inner map") + } + + return errors.Wrap(abi.InnerMap.check(other.InnerMap), "inner map") +} + +// ProgramABI describes a Program. +// +// Use it to assert that a Program matches what your code expects. +type ProgramABI struct { + Type ProgramType +} + +func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI { + return &ProgramABI{ + spec.Type, + } +} + +func newProgramABIFromFd(fd *bpfFD) (*ProgramABI, error) { + info, err := bpfGetProgInfoByFD(fd) + if err != nil { + return nil, err + } + + return newProgramABIFromInfo(info), nil +} + +func newProgramABIFromInfo(info *bpfProgInfo) *ProgramABI { + return &ProgramABI{ + Type: ProgramType(info.progType), + } +} + +// Check verifies that a Program conforms to the ABI. +// +// Members which have the zero value of their type +// are not checked. +func (abi *ProgramABI) Check(prog *Program) error { + if abi.Type != UnspecifiedProgram && prog.abi.Type != abi.Type { + return errors.Errorf("expected program type %s, have %s", abi.Type, prog.abi.Type) + } + + return nil +} + +func checkUint32(name string, want, have uint32) error { + if want != 0 && have != want { + return errors.Errorf("expected %s to be %d, have %d", name, want, have) + } + return nil +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu.go b/vendor/github.com/cilium/ebpf/asm/alu.go new file mode 100644 index 0000000..70ccc4d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu.go @@ -0,0 +1,149 @@ +package asm + +//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp + +// Source of ALU / ALU64 / Branch operations +// +// msb lsb +// +----+-+---+ +// |op |S|cls| +// +----+-+---+ +type Source uint8 + +const sourceMask OpCode = 0x08 + +// Source bitmask +const ( + // InvalidSource is returned by getters when invoked + // on non ALU / branch OpCodes. + InvalidSource Source = 0xff + // ImmSource src is from constant + ImmSource Source = 0x00 + // RegSource src is from register + RegSource Source = 0x08 +) + +// The Endianness of a byte swap instruction. +type Endianness uint8 + +const endianMask = sourceMask + +// Endian flags +const ( + InvalidEndian Endianness = 0xff + // Convert to little endian + LE Endianness = 0x00 + // Convert to big endian + BE Endianness = 0x08 +) + +// ALUOp are ALU / ALU64 operations +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type ALUOp uint8 + +const aluMask OpCode = 0xf0 + +const ( + // InvalidALUOp is returned by getters when invoked + // on non ALU OpCodes + InvalidALUOp ALUOp = 0xff + // Add - addition + Add ALUOp = 0x00 + // Sub - subtraction + Sub ALUOp = 0x10 + // Mul - multiplication + Mul ALUOp = 0x20 + // Div - division + Div ALUOp = 0x30 + // Or - bitwise or + Or ALUOp = 0x40 + // And - bitwise and + And ALUOp = 0x50 + // LSh - bitwise shift left + LSh ALUOp = 0x60 + // RSh - bitwise shift right + RSh ALUOp = 0x70 + // Neg - sign/unsign signing bit + Neg ALUOp = 0x80 + // Mod - modulo + Mod ALUOp = 0x90 + // Xor - bitwise xor + Xor ALUOp = 0xa0 + // Mov - move value from one place to another + Mov ALUOp = 0xb0 + // ArSh - arithmatic shift + ArSh ALUOp = 0xc0 + // Swap - endian conversions + Swap ALUOp = 0xd0 +) + +// HostTo converts from host to another endianness. +func HostTo(endian Endianness, dst Register, size Size) Instruction { + var imm int64 + switch size { + case Half: + imm = 16 + case Word: + imm = 32 + case DWord: + imm = 64 + default: + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)), + Dst: dst, + Constant: imm, + } +} + +// Op returns the OpCode for an ALU operation with a given source. +func (op ALUOp) Op(source Source) OpCode { + return OpCode(ALU64Class).SetALUOp(op).SetSource(source) +} + +// Reg emits `dst (op) src`. +func (op ALUOp) Reg(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm emits `dst (op) value`. +func (op ALUOp) Imm(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op(ImmSource), + Dst: dst, + Constant: int64(value), + } +} + +// Op32 returns the OpCode for a 32-bit ALU operation with a given source. +func (op ALUOp) Op32(source Source) OpCode { + return OpCode(ALUClass).SetALUOp(op).SetSource(source) +} + +// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst. +func (op ALUOp) Reg32(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op32(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst. +func (op ALUOp) Imm32(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op32(ImmSource), + Dst: dst, + Constant: int64(value), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu_string.go b/vendor/github.com/cilium/ebpf/asm/alu_string.go new file mode 100644 index 0000000..72d3fe6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu_string.go @@ -0,0 +1,107 @@ +// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSource-255] + _ = x[ImmSource-0] + _ = x[RegSource-8] +} + +const ( + _Source_name_0 = "ImmSource" + _Source_name_1 = "RegSource" + _Source_name_2 = "InvalidSource" +) + +func (i Source) String() string { + switch { + case i == 0: + return _Source_name_0 + case i == 8: + return _Source_name_1 + case i == 255: + return _Source_name_2 + default: + return "Source(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidEndian-255] + _ = x[LE-0] + _ = x[BE-8] +} + +const ( + _Endianness_name_0 = "LE" + _Endianness_name_1 = "BE" + _Endianness_name_2 = "InvalidEndian" +) + +func (i Endianness) String() string { + switch { + case i == 0: + return _Endianness_name_0 + case i == 8: + return _Endianness_name_1 + case i == 255: + return _Endianness_name_2 + default: + return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidALUOp-255] + _ = x[Add-0] + _ = x[Sub-16] + _ = x[Mul-32] + _ = x[Div-48] + _ = x[Or-64] + _ = x[And-80] + _ = x[LSh-96] + _ = x[RSh-112] + _ = x[Neg-128] + _ = x[Mod-144] + _ = x[Xor-160] + _ = x[Mov-176] + _ = x[ArSh-192] + _ = x[Swap-208] +} + +const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp" + +var _ALUOp_map = map[ALUOp]string{ + 0: _ALUOp_name[0:3], + 16: _ALUOp_name[3:6], + 32: _ALUOp_name[6:9], + 48: _ALUOp_name[9:12], + 64: _ALUOp_name[12:14], + 80: _ALUOp_name[14:17], + 96: _ALUOp_name[17:20], + 112: _ALUOp_name[20:23], + 128: _ALUOp_name[23:26], + 144: _ALUOp_name[26:29], + 160: _ALUOp_name[29:32], + 176: _ALUOp_name[32:35], + 192: _ALUOp_name[35:39], + 208: _ALUOp_name[39:43], + 255: _ALUOp_name[43:55], +} + +func (i ALUOp) String() string { + if str, ok := _ALUOp_map[i]; ok { + return str + } + return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/doc.go b/vendor/github.com/cilium/ebpf/asm/doc.go new file mode 100644 index 0000000..7031bdc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/doc.go @@ -0,0 +1,2 @@ +// Package asm is an assembler for eBPF bytecode. +package asm diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go new file mode 100644 index 0000000..97f794c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func.go @@ -0,0 +1,143 @@ +package asm + +//go:generate stringer -output func_string.go -type=BuiltinFunc + +// BuiltinFunc is a built-in eBPF function. +type BuiltinFunc int32 + +// eBPF built-in functions +// +// You can renegerate this list using the following gawk script: +// +// /FN\(.+\),/ { +// match($1, /\((.+)\)/, r) +// split(r[1], p, "_") +// printf "Fn" +// for (i in p) { +// printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2) +// } +// print "" +// } +// +// The script expects include/uapi/linux/bpf.h as it's input. +const ( + FnUnspec BuiltinFunc = iota + FnMapLookupElem + FnMapUpdateElem + FnMapDeleteElem + FnProbeRead + FnKtimeGetNs + FnTracePrintk + FnGetPrandomU32 + FnGetSmpProcessorId + FnSkbStoreBytes + FnL3CsumReplace + FnL4CsumReplace + FnTailCall + FnCloneRedirect + FnGetCurrentPidTgid + FnGetCurrentUidGid + FnGetCurrentComm + FnGetCgroupClassid + FnSkbVlanPush + FnSkbVlanPop + FnSkbGetTunnelKey + FnSkbSetTunnelKey + FnPerfEventRead + FnRedirect + FnGetRouteRealm + FnPerfEventOutput + FnSkbLoadBytes + FnGetStackid + FnCsumDiff + FnSkbGetTunnelOpt + FnSkbSetTunnelOpt + FnSkbChangeProto + FnSkbChangeType + FnSkbUnderCgroup + FnGetHashRecalc + FnGetCurrentTask + FnProbeWriteUser + FnCurrentTaskUnderCgroup + FnSkbChangeTail + FnSkbPullData + FnCsumUpdate + FnSetHashInvalid + FnGetNumaNodeId + FnSkbChangeHead + FnXdpAdjustHead + FnProbeReadStr + FnGetSocketCookie + FnGetSocketUid + FnSetHash + FnSetsockopt + FnSkbAdjustRoom + FnRedirectMap + FnSkRedirectMap + FnSockMapUpdate + FnXdpAdjustMeta + FnPerfEventReadValue + FnPerfProgReadValue + FnGetsockopt + FnOverrideReturn + FnSockOpsCbFlagsSet + FnMsgRedirectMap + FnMsgApplyBytes + FnMsgCorkBytes + FnMsgPullData + FnBind + FnXdpAdjustTail + FnSkbGetXfrmState + FnGetStack + FnSkbLoadBytesRelative + FnFibLookup + FnSockHashUpdate + FnMsgRedirectHash + FnSkRedirectHash + FnLwtPushEncap + FnLwtSeg6StoreBytes + FnLwtSeg6AdjustSrh + FnLwtSeg6Action + FnRcRepeat + FnRcKeydown + FnSkbCgroupId + FnGetCurrentCgroupId + FnGetLocalStorage + FnSkSelectReuseport + FnSkbAncestorCgroupId + FnSkLookupTcp + FnSkLookupUdp + FnSkRelease + FnMapPushElem + FnMapPopElem + FnMapPeekElem + FnMsgPushData + FnMsgPopData + FnRcPointerRel + FnSpinLock + FnSpinUnlock + FnSkFullsock + FnTcpSock + FnSkbEcnSetCe + FnGetListenerSock + FnSkcLookupTcp + FnTcpCheckSyncookie + FnSysctlGetName + FnSysctlGetCurrentValue + FnSysctlGetNewValue + FnSysctlSetNewValue + FnStrtol + FnStrtoul + FnSkStorageGet + FnSkStorageDelete + FnSendSignal + FnTcpGenSyncookie +) + +// Call emits a function call. +func (fn BuiltinFunc) Call() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Constant: int64(fn), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go new file mode 100644 index 0000000..8860b9f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func_string.go @@ -0,0 +1,133 @@ +// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[FnUnspec-0] + _ = x[FnMapLookupElem-1] + _ = x[FnMapUpdateElem-2] + _ = x[FnMapDeleteElem-3] + _ = x[FnProbeRead-4] + _ = x[FnKtimeGetNs-5] + _ = x[FnTracePrintk-6] + _ = x[FnGetPrandomU32-7] + _ = x[FnGetSmpProcessorId-8] + _ = x[FnSkbStoreBytes-9] + _ = x[FnL3CsumReplace-10] + _ = x[FnL4CsumReplace-11] + _ = x[FnTailCall-12] + _ = x[FnCloneRedirect-13] + _ = x[FnGetCurrentPidTgid-14] + _ = x[FnGetCurrentUidGid-15] + _ = x[FnGetCurrentComm-16] + _ = x[FnGetCgroupClassid-17] + _ = x[FnSkbVlanPush-18] + _ = x[FnSkbVlanPop-19] + _ = x[FnSkbGetTunnelKey-20] + _ = x[FnSkbSetTunnelKey-21] + _ = x[FnPerfEventRead-22] + _ = x[FnRedirect-23] + _ = x[FnGetRouteRealm-24] + _ = x[FnPerfEventOutput-25] + _ = x[FnSkbLoadBytes-26] + _ = x[FnGetStackid-27] + _ = x[FnCsumDiff-28] + _ = x[FnSkbGetTunnelOpt-29] + _ = x[FnSkbSetTunnelOpt-30] + _ = x[FnSkbChangeProto-31] + _ = x[FnSkbChangeType-32] + _ = x[FnSkbUnderCgroup-33] + _ = x[FnGetHashRecalc-34] + _ = x[FnGetCurrentTask-35] + _ = x[FnProbeWriteUser-36] + _ = x[FnCurrentTaskUnderCgroup-37] + _ = x[FnSkbChangeTail-38] + _ = x[FnSkbPullData-39] + _ = x[FnCsumUpdate-40] + _ = x[FnSetHashInvalid-41] + _ = x[FnGetNumaNodeId-42] + _ = x[FnSkbChangeHead-43] + _ = x[FnXdpAdjustHead-44] + _ = x[FnProbeReadStr-45] + _ = x[FnGetSocketCookie-46] + _ = x[FnGetSocketUid-47] + _ = x[FnSetHash-48] + _ = x[FnSetsockopt-49] + _ = x[FnSkbAdjustRoom-50] + _ = x[FnRedirectMap-51] + _ = x[FnSkRedirectMap-52] + _ = x[FnSockMapUpdate-53] + _ = x[FnXdpAdjustMeta-54] + _ = x[FnPerfEventReadValue-55] + _ = x[FnPerfProgReadValue-56] + _ = x[FnGetsockopt-57] + _ = x[FnOverrideReturn-58] + _ = x[FnSockOpsCbFlagsSet-59] + _ = x[FnMsgRedirectMap-60] + _ = x[FnMsgApplyBytes-61] + _ = x[FnMsgCorkBytes-62] + _ = x[FnMsgPullData-63] + _ = x[FnBind-64] + _ = x[FnXdpAdjustTail-65] + _ = x[FnSkbGetXfrmState-66] + _ = x[FnGetStack-67] + _ = x[FnSkbLoadBytesRelative-68] + _ = x[FnFibLookup-69] + _ = x[FnSockHashUpdate-70] + _ = x[FnMsgRedirectHash-71] + _ = x[FnSkRedirectHash-72] + _ = x[FnLwtPushEncap-73] + _ = x[FnLwtSeg6StoreBytes-74] + _ = x[FnLwtSeg6AdjustSrh-75] + _ = x[FnLwtSeg6Action-76] + _ = x[FnRcRepeat-77] + _ = x[FnRcKeydown-78] + _ = x[FnSkbCgroupId-79] + _ = x[FnGetCurrentCgroupId-80] + _ = x[FnGetLocalStorage-81] + _ = x[FnSkSelectReuseport-82] + _ = x[FnSkbAncestorCgroupId-83] + _ = x[FnSkLookupTcp-84] + _ = x[FnSkLookupUdp-85] + _ = x[FnSkRelease-86] + _ = x[FnMapPushElem-87] + _ = x[FnMapPopElem-88] + _ = x[FnMapPeekElem-89] + _ = x[FnMsgPushData-90] + _ = x[FnMsgPopData-91] + _ = x[FnRcPointerRel-92] + _ = x[FnSpinLock-93] + _ = x[FnSpinUnlock-94] + _ = x[FnSkFullsock-95] + _ = x[FnTcpSock-96] + _ = x[FnSkbEcnSetCe-97] + _ = x[FnGetListenerSock-98] + _ = x[FnSkcLookupTcp-99] + _ = x[FnTcpCheckSyncookie-100] + _ = x[FnSysctlGetName-101] + _ = x[FnSysctlGetCurrentValue-102] + _ = x[FnSysctlGetNewValue-103] + _ = x[FnSysctlSetNewValue-104] + _ = x[FnStrtol-105] + _ = x[FnStrtoul-106] + _ = x[FnSkStorageGet-107] + _ = x[FnSkStorageDelete-108] + _ = x[FnSendSignal-109] + _ = x[FnTcpGenSyncookie-110] +} + +const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie" + +var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632} + +func (i BuiltinFunc) String() string { + if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) { + return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go new file mode 100644 index 0000000..c8ed6cf --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/instruction.go @@ -0,0 +1,416 @@ +package asm + +import ( + "encoding/binary" + "fmt" + "io" + "math" + "strings" + + "github.com/pkg/errors" +) + +// InstructionSize is the size of a BPF instruction in bytes +const InstructionSize = 8 + +// Instruction is a single eBPF instruction. +type Instruction struct { + OpCode OpCode + Dst Register + Src Register + Offset int16 + Constant int64 + Reference string + Symbol string +} + +// Sym creates a symbol. +func (ins Instruction) Sym(name string) Instruction { + ins.Symbol = name + return ins +} + +// Unmarshal decodes a BPF instruction. +func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) { + var bi bpfInstruction + err := binary.Read(r, bo, &bi) + if err != nil { + return 0, err + } + + ins.OpCode = bi.OpCode + ins.Dst = bi.Registers.Dst() + ins.Src = bi.Registers.Src() + ins.Offset = bi.Offset + ins.Constant = int64(bi.Constant) + + if !bi.OpCode.isDWordLoad() { + return InstructionSize, nil + } + + var bi2 bpfInstruction + if err := binary.Read(r, bo, &bi2); err != nil { + // No Wrap, to avoid io.EOF clash + return 0, errors.New("64bit immediate is missing second half") + } + if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 { + return 0, errors.New("64bit immediate has non-zero fields") + } + ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant))) + + return 2 * InstructionSize, nil +} + +// Marshal encodes a BPF instruction. +func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) { + if ins.OpCode == InvalidOpCode { + return 0, errors.New("invalid opcode") + } + + isDWordLoad := ins.OpCode.isDWordLoad() + + cons := int32(ins.Constant) + if isDWordLoad { + // Encode least significant 32bit first for 64bit operations. + cons = int32(uint32(ins.Constant)) + } + + bpfi := bpfInstruction{ + ins.OpCode, + newBPFRegisters(ins.Dst, ins.Src), + ins.Offset, + cons, + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + if !isDWordLoad { + return InstructionSize, nil + } + + bpfi = bpfInstruction{ + Constant: int32(ins.Constant >> 32), + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + return 2 * InstructionSize, nil +} + +// RewriteMapPtr changes an instruction to use a new map fd. +// +// Returns an error if the fd is invalid, or the instruction +// is incorrect. +func (ins *Instruction) RewriteMapPtr(fd int) error { + if !ins.OpCode.isDWordLoad() { + return errors.Errorf("%s is not a 64 bit load", ins.OpCode) + } + + if fd < 0 { + return errors.New("invalid fd") + } + + ins.Src = R1 + ins.Constant = int64(fd) + return nil +} + +// Format implements fmt.Formatter. +func (ins Instruction) Format(f fmt.State, c rune) { + if c != 'v' { + fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c) + return + } + + op := ins.OpCode + + if op == InvalidOpCode { + fmt.Fprint(f, "INVALID") + return + } + + // Omit trailing space for Exit + if op.JumpOp() == Exit { + fmt.Fprint(f, op) + return + } + + fmt.Fprintf(f, "%v ", op) + switch cls := op.Class(); cls { + case LdClass, LdXClass, StClass, StXClass: + switch op.Mode() { + case ImmMode: + fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant) + case AbsMode: + fmt.Fprintf(f, "imm: %d", ins.Constant) + case IndMode: + fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant) + case MemMode: + fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant) + case XAddMode: + fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src) + } + + case ALU64Class, ALUClass: + fmt.Fprintf(f, "dst: %s ", ins.Dst) + if op.ALUOp() == Swap || op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + + case JumpClass: + switch jop := op.JumpOp(); jop { + case Call: + if ins.Src == R1 { + // bpf-to-bpf call + fmt.Fprint(f, ins.Constant) + } else { + fmt.Fprint(f, BuiltinFunc(ins.Constant)) + } + + default: + fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset) + if op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + } + } + + if ins.Reference != "" { + fmt.Fprintf(f, " <%s>", ins.Reference) + } +} + +// Instructions is an eBPF program. +type Instructions []Instruction + +func (insns Instructions) String() string { + return fmt.Sprint(insns) +} + +// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd. +// +// Returns an error if the symbol isn't used, see IsUnreferencedSymbol. +func (insns Instructions) RewriteMapPtr(symbol string, fd int) error { + if symbol == "" { + return errors.New("empty symbol") + } + + found := false + for i := range insns { + ins := &insns[i] + if ins.Reference != symbol { + continue + } + + if err := ins.RewriteMapPtr(fd); err != nil { + return err + } + + found = true + } + + if !found { + return &unreferencedSymbolError{symbol} + } + + return nil +} + +// SymbolOffsets returns the set of symbols and their offset in +// the instructions. +func (insns Instructions) SymbolOffsets() (map[string]int, error) { + offsets := make(map[string]int) + + for i, ins := range insns { + if ins.Symbol == "" { + continue + } + + if _, ok := offsets[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + offsets[ins.Symbol] = i + } + + return offsets, nil +} + +// ReferenceOffsets returns the set of references and their offset in +// the instructions. +func (insns Instructions) ReferenceOffsets() map[string][]int { + offsets := make(map[string][]int) + + for i, ins := range insns { + if ins.Reference == "" { + continue + } + + offsets[ins.Reference] = append(offsets[ins.Reference], i) + } + + return offsets +} + +func (insns Instructions) marshalledOffsets() (map[string]int, error) { + symbols := make(map[string]int) + + marshalledPos := 0 + for _, ins := range insns { + currentPos := marshalledPos + marshalledPos += ins.OpCode.marshalledInstructions() + + if ins.Symbol == "" { + continue + } + + if _, ok := symbols[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + symbols[ins.Symbol] = currentPos + } + + return symbols, nil +} + +// Format implements fmt.Formatter. +// +// You can control indentation of symbols by +// specifying a width. Setting a precision controls the indentation of +// instructions. +// The default character is a tab, which can be overriden by specifying +// the ' ' space flag. +func (insns Instructions) Format(f fmt.State, c rune) { + if c != 's' && c != 'v' { + fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c) + return + } + + // Precision is better in this case, because it allows + // specifying 0 padding easily. + padding, ok := f.Precision() + if !ok { + padding = 1 + } + + indent := strings.Repeat("\t", padding) + if f.Flag(' ') { + indent = strings.Repeat(" ", padding) + } + + symPadding, ok := f.Width() + if !ok { + symPadding = padding - 1 + } + if symPadding < 0 { + symPadding = 0 + } + + symIndent := strings.Repeat("\t", symPadding) + if f.Flag(' ') { + symIndent = strings.Repeat(" ", symPadding) + } + + // Figure out how many digits we need to represent the highest + // offset. + highestOffset := 0 + for _, ins := range insns { + highestOffset += ins.OpCode.marshalledInstructions() + } + offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset)))) + + offset := 0 + for _, ins := range insns { + if ins.Symbol != "" { + fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol) + } + fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins) + offset += ins.OpCode.marshalledInstructions() + } + + return +} + +// Marshal encodes a BPF program into the kernel format. +func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error { + absoluteOffsets, err := insns.marshalledOffsets() + if err != nil { + return err + } + + num := 0 + for i, ins := range insns { + switch { + case ins.OpCode.JumpOp() == Call && ins.Constant == -1: + // Rewrite bpf to bpf call + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Constant = int64(offset - num - 1) + + case ins.OpCode.Class() == JumpClass && ins.Offset == -1: + // Rewrite jump to label + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Offset = int16(offset - num - 1) + } + + n, err := ins.Marshal(w, bo) + if err != nil { + return errors.Wrapf(err, "instruction %d", i) + } + + num += int(n / InstructionSize) + } + return nil +} + +type bpfInstruction struct { + OpCode OpCode + Registers bpfRegisters + Offset int16 + Constant int32 +} + +type bpfRegisters uint8 + +func newBPFRegisters(dst, src Register) bpfRegisters { + return bpfRegisters((src << 4) | (dst & 0xF)) +} + +func (r bpfRegisters) Dst() Register { + return Register(r & 0xF) +} + +func (r bpfRegisters) Src() Register { + return Register(r >> 4) +} + +type unreferencedSymbolError struct { + symbol string +} + +func (use *unreferencedSymbolError) Error() string { + return fmt.Sprintf("unreferenced symbol %s", use.symbol) +} + +// IsUnreferencedSymbol returns true if err was caused by +// an unreferenced symbol. +func IsUnreferencedSymbol(err error) bool { + _, ok := err.(*unreferencedSymbolError) + return ok +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go new file mode 100644 index 0000000..33c9b56 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump.go @@ -0,0 +1,109 @@ +package asm + +//go:generate stringer -output jump_string.go -type=JumpOp + +// JumpOp affect control flow. +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type JumpOp uint8 + +const jumpMask OpCode = aluMask + +const ( + // InvalidJumpOp is returned by getters when invoked + // on non branch OpCodes + InvalidJumpOp JumpOp = 0xff + // Ja jumps by offset unconditionally + Ja JumpOp = 0x00 + // JEq jumps by offset if r == imm + JEq JumpOp = 0x10 + // JGT jumps by offset if r > imm + JGT JumpOp = 0x20 + // JGE jumps by offset if r >= imm + JGE JumpOp = 0x30 + // JSet jumps by offset if r & imm + JSet JumpOp = 0x40 + // JNE jumps by offset if r != imm + JNE JumpOp = 0x50 + // JSGT jumps by offset if signed r > signed imm + JSGT JumpOp = 0x60 + // JSGE jumps by offset if signed r >= signed imm + JSGE JumpOp = 0x70 + // Call builtin or user defined function from imm + Call JumpOp = 0x80 + // Exit ends execution, with value in r0 + Exit JumpOp = 0x90 + // JLT jumps by offset if r < imm + JLT JumpOp = 0xa0 + // JLE jumps by offset if r <= imm + JLE JumpOp = 0xb0 + // JSLT jumps by offset if signed r < signed imm + JSLT JumpOp = 0xc0 + // JSLE jumps by offset if signed r <= signed imm + JSLE JumpOp = 0xd0 +) + +// Return emits an exit instruction. +// +// Requires a return value in R0. +func Return() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Exit), + } +} + +// Op returns the OpCode for a given jump source. +func (op JumpOp) Op(source Source) OpCode { + return OpCode(JumpClass).SetJumpOp(op).SetSource(source) +} + +// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Imm(dst Register, value int32, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource), + Dst: dst, + Offset: -1, + Constant: int64(value), + Reference: label, + } +} + +// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Reg(dst, src Register, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource), + Dst: dst, + Src: src, + Offset: -1, + Reference: label, + } +} + +// Label adjusts PC to the address of the label. +func (op JumpOp) Label(label string) Instruction { + if op == Call { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Src: R1, + Constant: -1, + Reference: label, + } + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op), + Offset: -1, + Reference: label, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump_string.go b/vendor/github.com/cilium/ebpf/asm/jump_string.go new file mode 100644 index 0000000..85a4aaf --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump_string.go @@ -0,0 +1,53 @@ +// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidJumpOp-255] + _ = x[Ja-0] + _ = x[JEq-16] + _ = x[JGT-32] + _ = x[JGE-48] + _ = x[JSet-64] + _ = x[JNE-80] + _ = x[JSGT-96] + _ = x[JSGE-112] + _ = x[Call-128] + _ = x[Exit-144] + _ = x[JLT-160] + _ = x[JLE-176] + _ = x[JSLT-192] + _ = x[JSLE-208] +} + +const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp" + +var _JumpOp_map = map[JumpOp]string{ + 0: _JumpOp_name[0:2], + 16: _JumpOp_name[2:5], + 32: _JumpOp_name[5:8], + 48: _JumpOp_name[8:11], + 64: _JumpOp_name[11:15], + 80: _JumpOp_name[15:18], + 96: _JumpOp_name[18:22], + 112: _JumpOp_name[22:26], + 128: _JumpOp_name[26:30], + 144: _JumpOp_name[30:34], + 160: _JumpOp_name[34:37], + 176: _JumpOp_name[37:40], + 192: _JumpOp_name[40:44], + 208: _JumpOp_name[44:48], + 255: _JumpOp_name[48:61], +} + +func (i JumpOp) String() string { + if str, ok := _JumpOp_map[i]; ok { + return str + } + return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go new file mode 100644 index 0000000..ab0e92f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store.go @@ -0,0 +1,189 @@ +package asm + +//go:generate stringer -output load_store_string.go -type=Mode,Size + +// Mode for load and store operations +// +// msb lsb +// +---+--+---+ +// |MDE|sz|cls| +// +---+--+---+ +type Mode uint8 + +const modeMask OpCode = 0xe0 + +const ( + // InvalidMode is returned by getters when invoked + // on non load / store OpCodes + InvalidMode Mode = 0xff + // ImmMode - immediate value + ImmMode Mode = 0x00 + // AbsMode - immediate value + offset + AbsMode Mode = 0x20 + // IndMode - indirect (imm+src) + IndMode Mode = 0x40 + // MemMode - load from memory + MemMode Mode = 0x60 + // XAddMode - add atomically across processors. + XAddMode Mode = 0xc0 +) + +// Size of load and store operations +// +// msb lsb +// +---+--+---+ +// |mde|SZ|cls| +// +---+--+---+ +type Size uint8 + +const sizeMask OpCode = 0x18 + +const ( + // InvalidSize is returned by getters when invoked + // on non load / store OpCodes + InvalidSize Size = 0xff + // DWord - double word; 64 bits + DWord Size = 0x18 + // Word - word; 32 bits + Word Size = 0x00 + // Half - half-word; 16 bits + Half Size = 0x08 + // Byte - byte; 8 bits + Byte Size = 0x10 +) + +// Sizeof returns the size in bytes. +func (s Size) Sizeof() int { + switch s { + case DWord: + return 8 + case Word: + return 4 + case Half: + return 2 + case Byte: + return 1 + default: + return -1 + } +} + +// LoadMemOp returns the OpCode to load a value of given size from memory. +func LoadMemOp(size Size) OpCode { + return OpCode(LdXClass).SetMode(MemMode).SetSize(size) +} + +// LoadMem emits `dst = *(size *)(src + offset)`. +func LoadMem(dst, src Register, offset int16, size Size) Instruction { + return Instruction{ + OpCode: LoadMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// LoadImmOp returns the OpCode to load an immediate of given size. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImmOp(size Size) OpCode { + return OpCode(LdClass).SetMode(ImmMode).SetSize(size) +} + +// LoadImm emits `dst = (size)value`. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImm(dst Register, value int64, size Size) Instruction { + return Instruction{ + OpCode: LoadImmOp(size), + Dst: dst, + Constant: value, + } +} + +// LoadMapPtr stores a pointer to a map in dst. +func LoadMapPtr(dst Register, fd int) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: R1, + Constant: int64(fd), + } +} + +// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadIndOp(size Size) OpCode { + return OpCode(LdClass).SetMode(IndMode).SetSize(size) +} + +// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`. +func LoadInd(dst, src Register, offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadIndOp(size), + Dst: dst, + Src: src, + Constant: int64(offset), + } +} + +// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadAbsOp(size Size) OpCode { + return OpCode(LdClass).SetMode(AbsMode).SetSize(size) +} + +// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`. +func LoadAbs(offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadAbsOp(size), + Dst: R0, + Constant: int64(offset), + } +} + +// StoreMemOp returns the OpCode for storing a register of given size in memory. +func StoreMemOp(size Size) OpCode { + return OpCode(StXClass).SetMode(MemMode).SetSize(size) +} + +// StoreMem emits `*(size *)(dst + offset) = src` +func StoreMem(dst Register, offset int16, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// StoreImmOp returns the OpCode for storing an immediate of given size in memory. +func StoreImmOp(size Size) OpCode { + return OpCode(StClass).SetMode(MemMode).SetSize(size) +} + +// StoreImm emits `*(size *)(dst + offset) = value`. +func StoreImm(dst Register, offset int16, value int64, size Size) Instruction { + return Instruction{ + OpCode: StoreImmOp(size), + Dst: dst, + Offset: offset, + Constant: value, + } +} + +// StoreXAddOp returns the OpCode to atomically add a register to a value in memory. +func StoreXAddOp(size Size) OpCode { + return OpCode(StXClass).SetMode(XAddMode).SetSize(size) +} + +// StoreXAdd atomically adds src to *dst. +func StoreXAdd(dst, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreXAddOp(size), + Dst: dst, + Src: src, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store_string.go b/vendor/github.com/cilium/ebpf/asm/load_store_string.go new file mode 100644 index 0000000..76d29a0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store_string.go @@ -0,0 +1,80 @@ +// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidMode-255] + _ = x[ImmMode-0] + _ = x[AbsMode-32] + _ = x[IndMode-64] + _ = x[MemMode-96] + _ = x[XAddMode-192] +} + +const ( + _Mode_name_0 = "ImmMode" + _Mode_name_1 = "AbsMode" + _Mode_name_2 = "IndMode" + _Mode_name_3 = "MemMode" + _Mode_name_4 = "XAddMode" + _Mode_name_5 = "InvalidMode" +) + +func (i Mode) String() string { + switch { + case i == 0: + return _Mode_name_0 + case i == 32: + return _Mode_name_1 + case i == 64: + return _Mode_name_2 + case i == 96: + return _Mode_name_3 + case i == 192: + return _Mode_name_4 + case i == 255: + return _Mode_name_5 + default: + return "Mode(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSize-255] + _ = x[DWord-24] + _ = x[Word-0] + _ = x[Half-8] + _ = x[Byte-16] +} + +const ( + _Size_name_0 = "Word" + _Size_name_1 = "Half" + _Size_name_2 = "Byte" + _Size_name_3 = "DWord" + _Size_name_4 = "InvalidSize" +) + +func (i Size) String() string { + switch { + case i == 0: + return _Size_name_0 + case i == 8: + return _Size_name_1 + case i == 16: + return _Size_name_2 + case i == 24: + return _Size_name_3 + case i == 255: + return _Size_name_4 + default: + return "Size(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go new file mode 100644 index 0000000..d796de3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode.go @@ -0,0 +1,237 @@ +package asm + +import ( + "fmt" + "strings" +) + +//go:generate stringer -output opcode_string.go -type=Class + +type encoding int + +const ( + unknownEncoding encoding = iota + loadOrStore + jumpOrALU +) + +// Class of operations +// +// msb lsb +// +---+--+---+ +// | ?? |CLS| +// +---+--+---+ +type Class uint8 + +const classMask OpCode = 0x07 + +const ( + // LdClass load memory + LdClass Class = 0x00 + // LdXClass load memory from constant + LdXClass Class = 0x01 + // StClass load register from memory + StClass Class = 0x02 + // StXClass load register from constant + StXClass Class = 0x03 + // ALUClass arithmetic operators + ALUClass Class = 0x04 + // JumpClass jump operators + JumpClass Class = 0x05 + // ALU64Class arithmetic in 64 bit mode + ALU64Class Class = 0x07 +) + +func (cls Class) encoding() encoding { + switch cls { + case LdClass, LdXClass, StClass, StXClass: + return loadOrStore + case ALU64Class, ALUClass, JumpClass: + return jumpOrALU + default: + return unknownEncoding + } +} + +// OpCode is a packed eBPF opcode. +// +// Its encoding is defined by a Class value: +// +// msb lsb +// +----+-+---+ +// | ???? |CLS| +// +----+-+---+ +type OpCode uint8 + +// InvalidOpCode is returned by setters on OpCode +const InvalidOpCode OpCode = 0xff + +// marshalledInstructions returns the number of BPF instructions required +// to encode this opcode. +func (op OpCode) marshalledInstructions() int { + if op == LoadImmOp(DWord) { + return 2 + } + return 1 +} + +func (op OpCode) isDWordLoad() bool { + return op == LoadImmOp(DWord) +} + +// Class returns the class of operation. +func (op OpCode) Class() Class { + return Class(op & classMask) +} + +// Mode returns the mode for load and store operations. +func (op OpCode) Mode() Mode { + if op.Class().encoding() != loadOrStore { + return InvalidMode + } + return Mode(op & modeMask) +} + +// Size returns the size for load and store operations. +func (op OpCode) Size() Size { + if op.Class().encoding() != loadOrStore { + return InvalidSize + } + return Size(op & sizeMask) +} + +// Source returns the source for branch and ALU operations. +func (op OpCode) Source() Source { + if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap { + return InvalidSource + } + return Source(op & sourceMask) +} + +// ALUOp returns the ALUOp. +func (op OpCode) ALUOp() ALUOp { + if op.Class().encoding() != jumpOrALU { + return InvalidALUOp + } + return ALUOp(op & aluMask) +} + +// Endianness returns the Endianness for a byte swap instruction. +func (op OpCode) Endianness() Endianness { + if op.ALUOp() != Swap { + return InvalidEndian + } + return Endianness(op & endianMask) +} + +// JumpOp returns the JumpOp. +func (op OpCode) JumpOp() JumpOp { + if op.Class().encoding() != jumpOrALU { + return InvalidJumpOp + } + return JumpOp(op & jumpMask) +} + +// SetMode sets the mode on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetMode(mode Mode) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) { + return InvalidOpCode + } + return (op & ^modeMask) | OpCode(mode) +} + +// SetSize sets the size on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSize(size Size) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) { + return InvalidOpCode + } + return (op & ^sizeMask) | OpCode(size) +} + +// SetSource sets the source on jump and ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSource(source Source) OpCode { + if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) { + return InvalidOpCode + } + return (op & ^sourceMask) | OpCode(source) +} + +// SetALUOp sets the ALUOp on ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetALUOp(alu ALUOp) OpCode { + class := op.Class() + if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) { + return InvalidOpCode + } + return (op & ^aluMask) | OpCode(alu) +} + +// SetJumpOp sets the JumpOp on jump operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetJumpOp(jump JumpOp) OpCode { + if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) { + return InvalidOpCode + } + return (op & ^jumpMask) | OpCode(jump) +} + +func (op OpCode) String() string { + var f strings.Builder + + switch class := op.Class(); class { + case LdClass, LdXClass, StClass, StXClass: + f.WriteString(strings.TrimSuffix(class.String(), "Class")) + + mode := op.Mode() + f.WriteString(strings.TrimSuffix(mode.String(), "Mode")) + + switch op.Size() { + case DWord: + f.WriteString("DW") + case Word: + f.WriteString("W") + case Half: + f.WriteString("H") + case Byte: + f.WriteString("B") + } + + case ALU64Class, ALUClass: + f.WriteString(op.ALUOp().String()) + + if op.ALUOp() == Swap { + // Width for Endian is controlled by Constant + f.WriteString(op.Endianness().String()) + } else { + if class == ALUClass { + f.WriteString("32") + } + + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + case JumpClass: + f.WriteString(op.JumpOp().String()) + if jop := op.JumpOp(); jop != Exit && jop != Call { + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + default: + fmt.Fprintf(&f, "%#x", op) + } + + return f.String() +} + +// valid returns true if all bits in value are covered by mask. +func valid(value, mask OpCode) bool { + return value & ^mask == 0 +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode_string.go b/vendor/github.com/cilium/ebpf/asm/opcode_string.go new file mode 100644 index 0000000..079ce1d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode_string.go @@ -0,0 +1,38 @@ +// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[LdClass-0] + _ = x[LdXClass-1] + _ = x[StClass-2] + _ = x[StXClass-3] + _ = x[ALUClass-4] + _ = x[JumpClass-5] + _ = x[ALU64Class-7] +} + +const ( + _Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass" + _Class_name_1 = "ALU64Class" +) + +var ( + _Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47} +) + +func (i Class) String() string { + switch { + case 0 <= i && i <= 5: + return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]] + case i == 7: + return _Class_name_1 + default: + return "Class(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go new file mode 100644 index 0000000..4f284fb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/register.go @@ -0,0 +1,42 @@ +package asm + +import ( + "fmt" +) + +// Register is the source or destination of most operations. +type Register uint8 + +// R0 contains return values. +const R0 Register = 0 + +// Registers for function arguments. +const ( + R1 Register = R0 + 1 + iota + R2 + R3 + R4 + R5 +) + +// Callee saved registers preserved by function calls. +const ( + R6 Register = R5 + 1 + iota + R7 + R8 + R9 +) + +// Read-only frame pointer to access stack. +const ( + R10 Register = R9 + 1 + RFP = R10 +) + +func (r Register) String() string { + v := uint8(r) + if v == 10 { + return "rfp" + } + return fmt.Sprintf("r%d", v) +} diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go new file mode 100644 index 0000000..5ad1a5e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -0,0 +1,148 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" +) + +// CollectionOptions control loading a collection into the kernel. +type CollectionOptions struct { + Programs ProgramOptions +} + +// CollectionSpec describes a collection. +type CollectionSpec struct { + Maps map[string]*MapSpec + Programs map[string]*ProgramSpec +} + +// Copy returns a recursive copy of the spec. +func (cs *CollectionSpec) Copy() *CollectionSpec { + if cs == nil { + return nil + } + + cpy := CollectionSpec{ + Maps: make(map[string]*MapSpec, len(cs.Maps)), + Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + } + + for name, spec := range cs.Maps { + cpy.Maps[name] = spec.Copy() + } + + for name, spec := range cs.Programs { + cpy.Programs[name] = spec.Copy() + } + + return &cpy +} + +// Collection is a collection of Programs and Maps associated +// with their symbols +type Collection struct { + Programs map[string]*Program + Maps map[string]*Map +} + +// NewCollection creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. +func NewCollection(spec *CollectionSpec) (*Collection, error) { + return NewCollectionWithOptions(spec, CollectionOptions{}) +} + +// NewCollectionWithOptions creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. +func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) { + maps := make(map[string]*Map) + for mapName, mapSpec := range spec.Maps { + m, err := NewMap(mapSpec) + if err != nil { + return nil, errors.Wrapf(err, "map %s", mapName) + } + maps[mapName] = m + } + + progs := make(map[string]*Program) + for progName, origProgSpec := range spec.Programs { + progSpec := origProgSpec.Copy() + + // Rewrite any reference to a valid map. + for i := range progSpec.Instructions { + var ( + ins = &progSpec.Instructions[i] + m = maps[ins.Reference] + ) + + if ins.Reference == "" || m == nil { + continue + } + + if ins.Src == asm.R1 { + // Don't overwrite maps already rewritten, users can + // rewrite programs in the spec themselves + continue + } + + if err := ins.RewriteMapPtr(m.FD()); err != nil { + return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference) + } + } + + prog, err := NewProgramWithOptions(progSpec, opts.Programs) + if err != nil { + return nil, errors.Wrapf(err, "program %s", progName) + } + progs[progName] = prog + } + + return &Collection{ + progs, + maps, + }, nil +} + +// LoadCollection parses an object file and converts it to a collection. +func LoadCollection(file string) (*Collection, error) { + spec, err := LoadCollectionSpec(file) + if err != nil { + return nil, err + } + return NewCollection(spec) +} + +// Close frees all maps and programs associated with the collection. +// +// The collection mustn't be used afterwards. +func (coll *Collection) Close() { + for _, prog := range coll.Programs { + prog.Close() + } + for _, m := range coll.Maps { + m.Close() + } +} + +// DetachMap removes the named map from the Collection. +// +// This means that a later call to Close() will not affect this map. +// +// Returns nil if no map of that name exists. +func (coll *Collection) DetachMap(name string) *Map { + m := coll.Maps[name] + delete(coll.Maps, name) + return m +} + +// DetachProgram removes the named program from the Collection. +// +// This means that a later call to Close() will not affect this program. +// +// Returns nil if no program of that name exists. +func (coll *Collection) DetachProgram(name string) *Program { + p := coll.Programs[name] + delete(coll.Programs, name) + return p +} diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go new file mode 100644 index 0000000..d96e6b1 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/doc.go @@ -0,0 +1,17 @@ +// Package ebpf is a toolkit for working with eBPF programs. +// +// eBPF programs are small snippets of code which are executed directly +// in a VM in the Linux kernel, which makes them very fast and flexible. +// Many Linux subsystems now accept eBPF programs. This makes it possible +// to implement highly application specific logic inside the kernel, +// without having to modify the actual kernel itself. +// +// This package is designed for long-running processes which +// want to use eBPF to implement part of their application logic. It has no +// run-time dependencies outside of the library and the Linux kernel itself. +// eBPF code should be compiled ahead of time using clang, and shipped with +// your application as any other resource. +// +// This package doesn't include code required to attach eBPF to Linux +// subsystems, since this varies per subsystem. +package ebpf diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go new file mode 100644 index 0000000..3bdc084 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -0,0 +1,392 @@ +package ebpf + +import ( + "bytes" + "debug/elf" + "encoding/binary" + "fmt" + "io" + "os" + "strings" + + "github.com/cilium/ebpf/asm" + + "github.com/pkg/errors" +) + +type elfCode struct { + *elf.File + symbols []elf.Symbol + symbolsPerSection map[elf.SectionIndex]map[uint64]string +} + +// LoadCollectionSpec parses an ELF file into a CollectionSpec. +func LoadCollectionSpec(file string) (*CollectionSpec, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() + + spec, err := LoadCollectionSpecFromReader(f) + return spec, errors.Wrapf(err, "file %s", file) +} + +// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec. +func LoadCollectionSpecFromReader(code io.ReaderAt) (*CollectionSpec, error) { + f, err := elf.NewFile(code) + if err != nil { + return nil, err + } + defer f.Close() + + symbols, err := f.Symbols() + if err != nil { + return nil, errors.Wrap(err, "load symbols") + } + + ec := &elfCode{f, symbols, symbolsPerSection(symbols)} + + var licenseSection, versionSection *elf.Section + progSections := make(map[elf.SectionIndex]*elf.Section) + relSections := make(map[elf.SectionIndex]*elf.Section) + mapSections := make(map[elf.SectionIndex]*elf.Section) + for i, sec := range ec.Sections { + switch { + case strings.HasPrefix(sec.Name, "license"): + licenseSection = sec + case strings.HasPrefix(sec.Name, "version"): + versionSection = sec + case strings.HasPrefix(sec.Name, "maps"): + mapSections[elf.SectionIndex(i)] = sec + case sec.Type == elf.SHT_REL: + if int(sec.Info) >= len(ec.Sections) { + return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info) + } + + // Store relocations under the section index of the target + idx := elf.SectionIndex(sec.Info) + if relSections[idx] != nil { + return nil, errors.Errorf("section %d has multiple relocation sections", idx) + } + relSections[idx] = sec + case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0: + progSections[elf.SectionIndex(i)] = sec + } + } + + license, err := loadLicense(licenseSection) + if err != nil { + return nil, errors.Wrap(err, "load license") + } + + version, err := loadVersion(versionSection, ec.ByteOrder) + if err != nil { + return nil, errors.Wrap(err, "load version") + } + + maps, err := ec.loadMaps(mapSections) + if err != nil { + return nil, errors.Wrap(err, "load maps") + } + + progs, libs, err := ec.loadPrograms(progSections, relSections, license, version) + if err != nil { + return nil, errors.Wrap(err, "load programs") + } + + if len(libs) > 0 { + for name, prog := range progs { + prog.Instructions, err = link(prog.Instructions, libs...) + if err != nil { + return nil, errors.Wrapf(err, "program %s", name) + } + } + } + + return &CollectionSpec{maps, progs}, nil +} + +func loadLicense(sec *elf.Section) (string, error) { + if sec == nil { + return "", errors.Errorf("missing license section") + } + data, err := sec.Data() + if err != nil { + return "", errors.Wrapf(err, "section %s", sec.Name) + } + return string(bytes.TrimRight(data, "\000")), nil +} + +func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) { + if sec == nil { + return 0, nil + } + + var version uint32 + err := binary.Read(sec.Open(), bo, &version) + return version, errors.Wrapf(err, "section %s", sec.Name) +} + +func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, license string, version uint32) (map[string]*ProgramSpec, []asm.Instructions, error) { + var ( + progs = make(map[string]*ProgramSpec) + libs []asm.Instructions + ) + for idx, prog := range progSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return nil, nil, errors.Errorf("section %v: missing symbols", prog.Name) + } + + funcSym := syms[0] + if funcSym == "" { + return nil, nil, errors.Errorf("section %v: no label at start", prog.Name) + } + + rels, err := ec.loadRelocations(relSections[idx]) + if err != nil { + return nil, nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym) + } + + insns, err := ec.loadInstructions(prog, syms, rels) + if err != nil { + return nil, nil, errors.Wrapf(err, "program %s: can't unmarshal instructions", funcSym) + } + + if progType, attachType := getProgType(prog.Name); progType == UnspecifiedProgram { + // There is no single name we can use for "library" sections, + // since they may contain multiple functions. We'll decode the + // labels they contain later on, and then link sections that way. + libs = append(libs, insns) + } else { + progs[funcSym] = &ProgramSpec{ + Name: funcSym, + Type: progType, + AttachType: attachType, + License: license, + KernelVersion: version, + Instructions: insns, + } + } + } + return progs, libs, nil +} + +func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, error) { + var ( + r = section.Open() + insns asm.Instructions + ins asm.Instruction + offset uint64 + ) + for { + n, err := ins.Unmarshal(r, ec.ByteOrder) + if err == io.EOF { + return insns, nil + } + if err != nil { + return nil, errors.Wrapf(err, "offset %d", offset) + } + + ins.Symbol = symbols[offset] + ins.Reference = relocations[offset] + + insns = append(insns, ins) + offset += n + } +} + +func (ec *elfCode) loadMaps(mapSections map[elf.SectionIndex]*elf.Section) (map[string]*MapSpec, error) { + var ( + maps = make(map[string]*MapSpec) + b = make([]byte, 1) + ) + for idx, sec := range mapSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return nil, errors.Errorf("section %v: no symbols", sec.Name) + } + + if sec.Size%uint64(len(syms)) != 0 { + return nil, errors.Errorf("section %v: map descriptors are not of equal size", sec.Name) + } + + var ( + r = sec.Open() + size = sec.Size / uint64(len(syms)) + ) + for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size { + mapSym := syms[offset] + if mapSym == "" { + fmt.Println(syms) + return nil, errors.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) + } + + if maps[mapSym] != nil { + return nil, errors.Errorf("section %v: map %v already exists", sec.Name, mapSym) + } + + lr := io.LimitReader(r, int64(size)) + + var spec MapSpec + switch { + case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil: + return nil, errors.Errorf("map %v: missing type", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil: + return nil, errors.Errorf("map %v: missing key size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil: + return nil, errors.Errorf("map %v: missing value size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil: + return nil, errors.Errorf("map %v: missing max entries", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil: + return nil, errors.Errorf("map %v: missing flags", mapSym) + } + + for { + _, err := lr.Read(b) + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + if b[0] != 0 { + return nil, errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym) + } + } + + maps[mapSym] = &spec + } + } + return maps, nil +} + +func getProgType(v string) (ProgramType, AttachType) { + types := map[string]ProgramType{ + // From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568 + "socket": SocketFilter, + "seccomp": SocketFilter, + "kprobe/": Kprobe, + "kretprobe/": Kprobe, + "tracepoint/": TracePoint, + "xdp": XDP, + "perf_event": PerfEvent, + "sockops": SockOps, + "sk_skb": SkSKB, + "sk_msg": SkMsg, + "lirc_mode2": LircMode2, + "flow_dissector": FlowDissector, + + "cgroup_skb/": CGroupSKB, + "cgroup/dev": CGroupDevice, + "cgroup/skb": CGroupSKB, + "cgroup/sock": CGroupSock, + "cgroup/post_bind": CGroupSock, + "cgroup/bind": CGroupSockAddr, + "cgroup/connect": CGroupSockAddr, + "cgroup/sendmsg": CGroupSockAddr, + "cgroup/recvmsg": CGroupSockAddr, + "cgroup/sysctl": CGroupSysctl, + "cgroup/getsockopt": CGroupSockopt, + "cgroup/setsockopt": CGroupSockopt, + "classifier": SchedCLS, + "action": SchedACT, + } + attachTypes := map[string]AttachType{ + "cgroup_skb/ingress": AttachCGroupInetIngress, + "cgroup_skb/egress": AttachCGroupInetEgress, + "cgroup/sock": AttachCGroupInetSockCreate, + "cgroup/post_bind4": AttachCGroupInet4PostBind, + "cgroup/post_bind6": AttachCGroupInet6PostBind, + "cgroup/dev": AttachCGroupDevice, + "sockops": AttachCGroupSockOps, + "sk_skb/stream_parser": AttachSkSKBStreamParser, + "sk_skb/stream_verdict": AttachSkSKBStreamVerdict, + "sk_msg": AttachSkSKBStreamVerdict, + "lirc_mode2": AttachLircMode2, + "flow_dissector": AttachFlowDissector, + "cgroup/bind4": AttachCGroupInet4Bind, + "cgroup/bind6": AttachCGroupInet6Bind, + "cgroup/connect4": AttachCGroupInet4Connect, + "cgroup/connect6": AttachCGroupInet6Connect, + "cgroup/sendmsg4": AttachCGroupUDP4Sendmsg, + "cgroup/sendmsg6": AttachCGroupUDP6Sendmsg, + "cgroup/recvmsg4": AttachCGroupUDP4Recvmsg, + "cgroup/recvmsg6": AttachCGroupUDP6Recvmsg, + "cgroup/sysctl": AttachCGroupSysctl, + "cgroup/getsockopt": AttachCGroupGetsockopt, + "cgroup/setsockopt": AttachCGroupSetsockopt, + } + attachType := AttachNone + for k, t := range attachTypes { + if strings.HasPrefix(v, k) { + attachType = t + } + } + + for k, t := range types { + if strings.HasPrefix(v, k) { + return t, attachType + } + } + return UnspecifiedProgram, AttachNone +} + +func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) { + rels := make(map[uint64]string) + if sec == nil { + return rels, nil + } + + if sec.Entsize < 16 { + return nil, errors.New("rels are less than 16 bytes") + } + + r := sec.Open() + for off := uint64(0); off < sec.Size; off += sec.Entsize { + ent := io.LimitReader(r, int64(sec.Entsize)) + + var rel elf.Rel64 + if binary.Read(ent, ec.ByteOrder, &rel) != nil { + return nil, errors.Errorf("can't parse relocation at offset %v", off) + } + + symNo := int(elf.R_SYM64(rel.Info) - 1) + if symNo >= len(ec.symbols) { + return nil, errors.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo) + } + + rels[rel.Off] = ec.symbols[symNo].Name + } + return rels, nil +} + +func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string { + result := make(map[elf.SectionIndex]map[uint64]string) + for i, sym := range symbols { + switch elf.ST_TYPE(sym.Info) { + case elf.STT_NOTYPE: + // Older versions of LLVM doesn't tag + // symbols correctly. + break + case elf.STT_OBJECT: + break + case elf.STT_FUNC: + break + default: + continue + } + + if sym.Name == "" { + continue + } + + idx := sym.Section + if _, ok := result[idx]; !ok { + result[idx] = make(map[uint64]string) + } + result[idx][sym.Value] = symbols[i].Name + } + return result +} diff --git a/vendor/github.com/cilium/ebpf/feature.go b/vendor/github.com/cilium/ebpf/feature.go new file mode 100644 index 0000000..9104bc9 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/feature.go @@ -0,0 +1,19 @@ +package ebpf + +import ( + "sync" +) + +type featureTest struct { + Fn func() bool + + once sync.Once + result bool +} + +func (ft *featureTest) Result() bool { + ft.once.Do(func() { + ft.result = ft.Fn() + }) + return ft.result +} diff --git a/vendor/github.com/cilium/ebpf/go.mod b/vendor/github.com/cilium/ebpf/go.mod new file mode 100644 index 0000000..687bdec --- /dev/null +++ b/vendor/github.com/cilium/ebpf/go.mod @@ -0,0 +1,8 @@ +module github.com/cilium/ebpf + +go 1.12 + +require ( + github.com/pkg/errors v0.8.1 + golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7 +) diff --git a/vendor/github.com/cilium/ebpf/internal/cpu.go b/vendor/github.com/cilium/ebpf/internal/cpu.go new file mode 100644 index 0000000..ce3cab7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/cpu.go @@ -0,0 +1,64 @@ +package internal + +import ( + "fmt" + "os" + "sync" + + "github.com/pkg/errors" +) + +var sysCPU struct { + once sync.Once + err error + num int +} + +// PossibleCPUs returns the max number of CPUs a system may possibly have +// Logical CPU numbers must be of the form 0-n +func PossibleCPUs() (int, error) { + sysCPU.once.Do(func() { + sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible") + }) + + return sysCPU.num, sysCPU.err +} + +var onlineCPU struct { + once sync.Once + err error + num int +} + +// OnlineCPUs returns the number of currently online CPUs +// Logical CPU numbers must be of the form 0-n +func OnlineCPUs() (int, error) { + onlineCPU.once.Do(func() { + onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online") + }) + + return onlineCPU.num, onlineCPU.err +} + +// parseCPUs parses the number of cpus from sysfs, +// in the format of "/sys/devices/system/cpu/{possible,online,..}. +// Logical CPU numbers must be of the form 0-n +func parseCPUs(path string) (int, error) { + file, err := os.Open(path) + if err != nil { + return 0, err + } + defer file.Close() + + var low, high int + n, _ := fmt.Fscanf(file, "%d-%d", &low, &high) + if n < 1 || low != 0 { + return 0, errors.Wrapf(err, "%s has unknown format", path) + } + if n == 1 { + high = low + } + + // cpus is 0 indexed + return high + 1, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/endian.go b/vendor/github.com/cilium/ebpf/internal/endian.go new file mode 100644 index 0000000..ac8a94e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/endian.go @@ -0,0 +1,24 @@ +package internal + +import ( + "encoding/binary" + "unsafe" +) + +// NativeEndian is set to either binary.BigEndian or binary.LittleEndian, +// depending on the host's endianness. +var NativeEndian binary.ByteOrder + +func init() { + if isBigEndian() { + NativeEndian = binary.BigEndian + } else { + NativeEndian = binary.LittleEndian + } +} + +func isBigEndian() (ret bool) { + i := int(0x1) + bs := (*[int(unsafe.Sizeof(i))]byte)(unsafe.Pointer(&i)) + return bs[0] == 0 +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go new file mode 100644 index 0000000..49c6be5 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go @@ -0,0 +1,118 @@ +// +build linux + +package unix + +import ( + "syscall" + + linux "golang.org/x/sys/unix" +) + +const ( + ENOENT = linux.ENOENT + EAGAIN = linux.EAGAIN + ENOSPC = linux.ENOSPC + EINVAL = linux.EINVAL + EPOLLIN = linux.EPOLLIN + BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN + BPF_TAG_SIZE = linux.BPF_TAG_SIZE + SYS_BPF = linux.SYS_BPF + F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC + EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD + EPOLL_CLOEXEC = linux.EPOLL_CLOEXEC + O_CLOEXEC = linux.O_CLOEXEC + O_NONBLOCK = linux.O_NONBLOCK + PROT_READ = linux.PROT_READ + PROT_WRITE = linux.PROT_WRITE + MAP_SHARED = linux.MAP_SHARED + PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE + PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT + PerfBitWatermark = linux.PerfBitWatermark + PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW + PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC +) + +// Statfs_t is a wrapper +type Statfs_t = linux.Statfs_t + +// Rlimit is a wrapper +type Rlimit = linux.Rlimit + +// Setrlimit is a wrapper +func Setrlimit(resource int, rlim *Rlimit) (err error) { + return linux.Setrlimit(resource, rlim) +} + +// Syscall is a wrapper +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return linux.Syscall(trap, a1, a2, a3) +} + +// FcntlInt is a wrapper +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return linux.FcntlInt(fd, cmd, arg) +} + +// Statfs is a wrapper +func Statfs(path string, buf *Statfs_t) (err error) { + return linux.Statfs(path, buf) +} + +// Close is a wrapper +func Close(fd int) (err error) { + return linux.Close(fd) +} + +// EpollEvent is a wrapper +type EpollEvent = linux.EpollEvent + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return linux.EpollWait(epfd, events, msec) +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return linux.EpollCtl(epfd, op, fd, event) +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return linux.Eventfd(initval, flags) +} + +// Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return linux.Write(fd, p) +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return linux.EpollCreate1(flag) +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage linux.PerfEventMmapPage + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return linux.SetNonblock(fd, nonblocking) +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return linux.Mmap(fd, offset, length, prot, flags) +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return linux.Munmap(b) +} + +// PerfEventAttr is a wrapper +type PerfEventAttr = linux.PerfEventAttr + +// PerfEventOpen is a wrapper +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags) +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go new file mode 100644 index 0000000..a327f2a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go @@ -0,0 +1,183 @@ +// +build !linux + +package unix + +import ( + "fmt" + "runtime" + "syscall" +) + +var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) + +const ( + ENOENT = syscall.ENOENT + EAGAIN = syscall.EAGAIN + ENOSPC = syscall.ENOSPC + EINVAL = syscall.EINVAL + BPF_OBJ_NAME_LEN = 0x10 + BPF_TAG_SIZE = 0x8 + SYS_BPF = 321 + F_DUPFD_CLOEXEC = 0x406 + EPOLLIN = 0x1 + EPOLL_CTL_ADD = 0x1 + EPOLL_CLOEXEC = 0x80000 + O_CLOEXEC = 0x80000 + O_NONBLOCK = 0x800 + PROT_READ = 0x1 + PROT_WRITE = 0x2 + MAP_SHARED = 0x1 + PERF_TYPE_SOFTWARE = 0x1 + PERF_COUNT_SW_BPF_OUTPUT = 0xa + PerfBitWatermark = 0x4000 + PERF_SAMPLE_RAW = 0x400 + PERF_FLAG_FD_CLOEXEC = 0x8 +) + +// Statfs_t is a wrapper +type Statfs_t struct { + Type int64 + Bsize int64 + Blocks uint64 + Bfree uint64 + Bavail uint64 + Files uint64 + Ffree uint64 + Fsid [2]int32 + Namelen int64 + Frsize int64 + Flags int64 + Spare [4]int64 +} + +// Rlimit is a wrapper +type Rlimit struct { + Cur uint64 + Max uint64 +} + +// Setrlimit is a wrapper +func Setrlimit(resource int, rlim *Rlimit) (err error) { + return errNonLinux +} + +// Syscall is a wrapper +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return 0, 0, syscall.Errno(1) +} + +// FcntlInt is a wrapper +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return -1, errNonLinux +} + +// Statfs is a wrapper +func Statfs(path string, buf *Statfs_t) error { + return errNonLinux +} + +// Close is a wrapper +func Close(fd int) (err error) { + return errNonLinux +} + +// EpollEvent is a wrapper +type EpollEvent struct { + Events uint32 + Fd int32 + Pad int32 +} + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return 0, errNonLinux +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return errNonLinux +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return 0, errNonLinux +} + +// Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return 0, errNonLinux +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return 0, errNonLinux +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage struct { + Version uint32 + Compat_version uint32 + Lock uint32 + Index uint32 + Offset int64 + Time_enabled uint64 + Time_running uint64 + Capabilities uint64 + Pmc_width uint16 + Time_shift uint16 + Time_mult uint32 + Time_offset uint64 + Time_zero uint64 + Size uint32 + + Data_head uint64 + Data_tail uint64 + Data_offset uint64 + Data_size uint64 + Aux_head uint64 + Aux_tail uint64 + Aux_offset uint64 + Aux_size uint64 +} + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return errNonLinux +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return []byte{}, errNonLinux +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return errNonLinux +} + +// PerfEventAttr is a wrapper +type PerfEventAttr struct { + Type uint32 + Size uint32 + Config uint64 + Sample uint64 + Sample_type uint64 + Read_format uint64 + Bits uint64 + Wakeup uint32 + Bp_type uint32 + Ext1 uint64 + Ext2 uint64 + Branch_sample_type uint64 + Sample_regs_user uint64 + Sample_stack_user uint32 + Clockid int32 + Sample_regs_intr uint64 + Aux_watermark uint32 + Sample_max_stack uint16 +} + +// PerfEventOpen is a wrapper +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return 0, errNonLinux +} diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go new file mode 100644 index 0000000..da556c2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/linker.go @@ -0,0 +1,58 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" +) + +// link resolves bpf-to-bpf calls. +// +// Each section may contain multiple functions / labels, and is only linked +// if the program being edited references one of these functions. +// +// Sections must not require linking themselves. +func link(insns asm.Instructions, sections ...asm.Instructions) (asm.Instructions, error) { + for _, section := range sections { + var err error + insns, err = linkSection(insns, section) + if err != nil { + return nil, err + } + } + return insns, nil +} + +func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { + // A map of symbols to the libraries which contain them. + symbols, err := section.SymbolOffsets() + if err != nil { + return nil, err + } + + for _, ins := range insns { + if ins.Reference == "" { + continue + } + + if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 { + continue + } + + if ins.Constant != -1 { + // This is already a valid call, no need to link again. + continue + } + + if _, ok := symbols[ins.Reference]; !ok { + // Symbol isn't available in this section + continue + } + + // At this point we know that at least one function in the + // library is called from insns. Merge the two sections. + // The rewrite of ins.Constant happens in asm.Instruction.Marshal. + return append(insns, section...), nil + } + + // None of the functions in the section are called. Do nothing. + return insns, nil +} diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go new file mode 100644 index 0000000..028a913 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/map.go @@ -0,0 +1,595 @@ +package ebpf + +import ( + "fmt" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +// MapSpec defines a Map. +type MapSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 + // InnerMap is used as a template for ArrayOfMaps and HashOfMaps + InnerMap *MapSpec +} + +func (ms *MapSpec) String() string { + return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags) +} + +// Copy returns a copy of the spec. +func (ms *MapSpec) Copy() *MapSpec { + if ms == nil { + return nil + } + + cpy := *ms + cpy.InnerMap = ms.InnerMap.Copy() + return &cpy +} + +// Map represents a Map file descriptor. +// +// It is not safe to close a map which is used by other goroutines. +// +// Methods which take interface{} arguments by default encode +// them using binary.Read/Write in the machine's native endianness. +// +// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler +// if you require custom encoding. +type Map struct { + fd *bpfFD + abi MapABI + // Per CPU maps return values larger than the size in the spec + fullValueSize int +} + +// NewMapFromFD creates a map from a raw fd. +// +// You should not use fd after calling this function. +func NewMapFromFD(fd int) (*Map, error) { + if fd < 0 { + return nil, errors.New("invalid fd") + } + bpfFd := newBPFFD(uint32(fd)) + + abi, err := newMapABIFromFd(bpfFd) + if err != nil { + bpfFd.forget() + return nil, err + } + return newMap(bpfFd, abi) +} + +// NewMap creates a new Map. +// +// Creating a map for the first time will perform feature detection +// by creating small, temporary maps. +func NewMap(spec *MapSpec) (*Map, error) { + if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps { + return createMap(spec, nil) + } + + if spec.InnerMap == nil { + return nil, errors.Errorf("%s requires InnerMap", spec.Type) + } + + template, err := createMap(spec.InnerMap, nil) + if err != nil { + return nil, err + } + defer template.Close() + + return createMap(spec, template.fd) +} + +func createMap(spec *MapSpec, inner *bpfFD) (*Map, error) { + spec = spec.Copy() + + switch spec.Type { + case ArrayOfMaps: + fallthrough + case HashOfMaps: + if spec.ValueSize != 0 && spec.ValueSize != 4 { + return nil, errors.Errorf("ValueSize must be zero or four for map of map") + } + spec.ValueSize = 4 + + case PerfEventArray: + if spec.KeySize != 0 { + return nil, errors.Errorf("KeySize must be zero for perf event array") + } + if spec.ValueSize != 0 { + return nil, errors.Errorf("ValueSize must be zero for perf event array") + } + if spec.MaxEntries == 0 { + n, err := internal.OnlineCPUs() + if err != nil { + return nil, errors.Wrap(err, "perf event array") + } + spec.MaxEntries = uint32(n) + } + + spec.KeySize = 4 + spec.ValueSize = 4 + } + + attr := bpfMapCreateAttr{ + mapType: spec.Type, + keySize: spec.KeySize, + valueSize: spec.ValueSize, + maxEntries: spec.MaxEntries, + flags: spec.Flags, + } + + if inner != nil { + var err error + attr.innerMapFd, err = inner.value() + if err != nil { + return nil, errors.Wrap(err, "map create") + } + } + + name, err := newBPFObjName(spec.Name) + if err != nil { + return nil, errors.Wrap(err, "map create") + } + + if haveObjName.Result() { + attr.mapName = name + } + + fd, err := bpfMapCreate(&attr) + if err != nil { + return nil, errors.Wrap(err, "map create") + } + + return newMap(fd, newMapABIFromSpec(spec)) +} + +func newMap(fd *bpfFD, abi *MapABI) (*Map, error) { + m := &Map{ + fd, + *abi, + int(abi.ValueSize), + } + + if !abi.Type.hasPerCPUValue() { + return m, nil + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return nil, err + } + + m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs + return m, nil +} + +func (m *Map) String() string { + return fmt.Sprintf("%s#%d", m.abi.Type, m.fd) +} + +// ABI gets the ABI of the Map +func (m *Map) ABI() MapABI { + return m.abi +} + +// Lookup retrieves a value from a Map. +// +// Calls Close() on valueOut if it is of type **Map or **Program, +// and *valueOut is not nil. +// +// Returns an error if the key doesn't exist, see IsNotExist. +func (m *Map) Lookup(key, valueOut interface{}) error { + valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize) + + if err := m.lookup(key, valuePtr); err != nil { + return err + } + + if valueBytes == nil { + return nil + } + + if m.abi.Type.hasPerCPUValue() { + return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes) + } + + switch value := valueOut.(type) { + case **Map: + m, err := unmarshalMap(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = m + return nil + case *Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + case Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + + case **Program: + p, err := unmarshalProgram(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = p + return nil + case *Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + case Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + + default: + return unmarshalBytes(valueOut, valueBytes) + } +} + +// LookupBytes gets a value from Map. +// +// Returns a nil value if a key doesn't exist. +func (m *Map) LookupBytes(key interface{}) ([]byte, error) { + valueBytes := make([]byte, m.fullValueSize) + valuePtr := newPtr(unsafe.Pointer(&valueBytes[0])) + + err := m.lookup(key, valuePtr) + if IsNotExist(err) { + return nil, nil + } + + return valueBytes, err +} + +func (m *Map) lookup(key interface{}, valueOut syscallPtr) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapLookupElem(m.fd, keyPtr, valueOut) + return errors.WithMessage(err, "lookup failed") +} + +// MapUpdateFlags controls the behaviour of the Map.Update call. +// +// The exact semantics depend on the specific MapType. +type MapUpdateFlags uint64 + +const ( + // UpdateAny creates a new element or update an existing one. + UpdateAny MapUpdateFlags = iota + // UpdateNoExist creates a new element. + UpdateNoExist MapUpdateFlags = 1 << (iota - 1) + // UpdateExist updates an existing element. + UpdateExist +) + +// Put replaces or creates a value in map. +// +// It is equivalent to calling Update with UpdateAny. +func (m *Map) Put(key, value interface{}) error { + return m.Update(key, value, UpdateAny) +} + +// Update changes the value of a key. +func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + var valuePtr syscallPtr + if m.abi.Type.hasPerCPUValue() { + valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize)) + } else { + valuePtr, err = marshalPtr(value, int(m.abi.ValueSize)) + } + if err != nil { + return errors.WithMessage(err, "can't marshal value") + } + + return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags)) +} + +// Delete removes a value. +// +// Returns an error if the key does not exist, see IsNotExist. +func (m *Map) Delete(key interface{}) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapDeleteElem(m.fd, keyPtr) + return errors.WithMessage(err, "can't delete key") +} + +// NextKey finds the key following an initial key. +// +// See NextKeyBytes for details. +func (m *Map) NextKey(key, nextKeyOut interface{}) error { + nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize)) + + if err := m.nextKey(key, nextKeyPtr); err != nil { + return err + } + + if nextKeyBytes == nil { + return nil + } + + err := unmarshalBytes(nextKeyOut, nextKeyBytes) + return errors.WithMessage(err, "can't unmarshal next key") +} + +// NextKeyBytes returns the key following an initial key as a byte slice. +// +// Passing nil will return the first key. +// +// Use Iterate if you want to traverse all entries in the map. +func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) { + nextKey := make([]byte, m.abi.KeySize) + nextKeyPtr := newPtr(unsafe.Pointer(&nextKey[0])) + + err := m.nextKey(key, nextKeyPtr) + if IsNotExist(err) { + return nil, nil + } + + return nextKey, err +} + +func (m *Map) nextKey(key interface{}, nextKeyOut syscallPtr) error { + var ( + keyPtr syscallPtr + err error + ) + + if key != nil { + keyPtr, err = marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + } + + err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut) + return errors.WithMessage(err, "can't get next key") +} + +// Iterate traverses a map. +// +// It's safe to create multiple iterators at the same time. +// +// It's not possible to guarantee that all keys in a map will be +// returned if there are concurrent modifications to the map. +func (m *Map) Iterate() *MapIterator { + return newMapIterator(m) +} + +// Close removes a Map +func (m *Map) Close() error { + if m == nil { + // This makes it easier to clean up when iterating maps + // of maps / programs. + return nil + } + + return m.fd.close() +} + +// FD gets the file descriptor of the Map. +// +// Calling this function is invalid after Close has been called. +func (m *Map) FD() int { + fd, err := m.fd.value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. + return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Map. +// +// Closing the duplicate does not affect the original, and vice versa. +// Changes made to the map are reflected by both instances however. +// +// Cloning a nil Map returns nil. +func (m *Map) Clone() (*Map, error) { + if m == nil { + return nil, nil + } + + dup, err := m.fd.dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone map") + } + + return newMap(dup, &m.abi) +} + +// Pin persists the map past the lifetime of the process that created it. +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (m *Map) Pin(fileName string) error { + return bpfPinObject(fileName, m.fd) +} + +// LoadPinnedMap load a Map from a BPF file. +// +// Requires at least Linux 4.13, and is not compatible with +// nested maps. Use LoadPinnedMapExplicit in these situations. +func LoadPinnedMap(fileName string) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + return newMap(fd, abi) +} + +// LoadPinnedMapExplicit loads a map with explicit parameters. +func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + return newMap(fd, abi) +} + +func unmarshalMap(buf []byte) (*Map, error) { + if len(buf) != 4 { + return nil, errors.New("map id requires 4 byte value") + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + id := internal.NativeEndian.Uint32(buf) + fd, err := bpfGetMapFDByID(id) + if err != nil { + return nil, err + } + + abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + + return newMap(fd, abi) +} + +// MarshalBinary implements BinaryMarshaler. +func (m *Map) MarshalBinary() ([]byte, error) { + fd, err := m.fd.value() + if err != nil { + return nil, err + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, fd) + return buf, nil +} + +// MapIterator iterates a Map. +// +// See Map.Iterate. +type MapIterator struct { + target *Map + prevKey interface{} + prevBytes []byte + count, maxEntries uint32 + done bool + err error +} + +func newMapIterator(target *Map) *MapIterator { + return &MapIterator{ + target: target, + maxEntries: target.abi.MaxEntries, + prevBytes: make([]byte, int(target.abi.KeySize)), + } +} + +var errIterationAborted = errors.New("iteration aborted") + +// Next decodes the next key and value. +// +// Iterating a hash map from which keys are being deleted is not +// safe. You may see the same key multiple times. Iteration may +// also abort with an error, see IsIterationAborted. +// +// Returns false if there are no more entries. You must check +// the result of Err afterwards. +// +// See Map.Get for further caveats around valueOut. +func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { + if mi.err != nil || mi.done { + return false + } + + for ; mi.count < mi.maxEntries; mi.count++ { + var nextBytes []byte + nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey) + if mi.err != nil { + return false + } + + if nextBytes == nil { + mi.done = true + return false + } + + // The user can get access to nextBytes since unmarshalBytes + // does not copy when unmarshaling into a []byte. + // Make a copy to prevent accidental corruption of + // iterator state. + copy(mi.prevBytes, nextBytes) + mi.prevKey = mi.prevBytes + + mi.err = mi.target.Lookup(nextBytes, valueOut) + if IsNotExist(mi.err) { + // Even though the key should be valid, we couldn't look up + // its value. If we're iterating a hash map this is probably + // because a concurrent delete removed the value before we + // could get it. This means that the next call to NextKeyBytes + // is very likely to restart iteration. + // If we're iterating one of the fd maps like + // ProgramArray it means that a given slot doesn't have + // a valid fd associated. It's OK to continue to the next slot. + continue + } + if mi.err != nil { + return false + } + + mi.err = unmarshalBytes(keyOut, nextBytes) + return mi.err == nil + } + + mi.err = errIterationAborted + return false +} + +// Err returns any encountered error. +// +// The method must be called after Next returns nil. +func (mi *MapIterator) Err() error { + return mi.err +} + +// IsNotExist returns true if the error indicates that a +// key doesn't exist. +func IsNotExist(err error) bool { + return errors.Cause(err) == unix.ENOENT +} + +// IsIterationAborted returns true if the iteration was aborted. +// +// This occurs when keys are deleted from a hash map during iteration. +func IsIterationAborted(err error) bool { + return errors.Cause(err) == errIterationAborted +} diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go new file mode 100644 index 0000000..44ba273 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/marshalers.go @@ -0,0 +1,192 @@ +package ebpf + +import ( + "bytes" + "encoding" + "encoding/binary" + "reflect" + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal" + + "github.com/pkg/errors" +) + +func marshalPtr(data interface{}, length int) (syscallPtr, error) { + if ptr, ok := data.(unsafe.Pointer); ok { + return newPtr(ptr), nil + } + + buf, err := marshalBytes(data, length) + if err != nil { + return syscallPtr{}, err + } + + return newPtr(unsafe.Pointer(&buf[0])), nil +} + +func marshalBytes(data interface{}, length int) (buf []byte, err error) { + switch value := data.(type) { + case encoding.BinaryMarshaler: + buf, err = value.MarshalBinary() + case string: + buf = []byte(value) + case []byte: + buf = value + case unsafe.Pointer: + err = errors.New("can't marshal from unsafe.Pointer") + default: + var wr bytes.Buffer + err = binary.Write(&wr, internal.NativeEndian, value) + err = errors.Wrapf(err, "encoding %T", value) + buf = wr.Bytes() + } + if err != nil { + return nil, err + } + + if len(buf) != length { + return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length) + } + return buf, nil +} + +func makeBuffer(dst interface{}, length int) (syscallPtr, []byte) { + if ptr, ok := dst.(unsafe.Pointer); ok { + return newPtr(ptr), nil + } + + buf := make([]byte, length) + return newPtr(unsafe.Pointer(&buf[0])), buf +} + +func unmarshalBytes(data interface{}, buf []byte) error { + switch value := data.(type) { + case unsafe.Pointer: + sh := &reflect.SliceHeader{ + Data: uintptr(value), + Len: len(buf), + Cap: len(buf), + } + + dst := *(*[]byte)(unsafe.Pointer(sh)) + copy(dst, buf) + runtime.KeepAlive(value) + return nil + case encoding.BinaryUnmarshaler: + return value.UnmarshalBinary(buf) + case *string: + *value = string(buf) + return nil + case *[]byte: + *value = buf + return nil + case string: + return errors.New("require pointer to string") + case []byte: + return errors.New("require pointer to []byte") + default: + rd := bytes.NewReader(buf) + err := binary.Read(rd, internal.NativeEndian, value) + return errors.Wrapf(err, "decoding %T", value) + } +} + +// marshalPerCPUValue encodes a slice containing one value per +// possible CPU into a buffer of bytes. +// +// Values are initialized to zero if the slice has less elements than CPUs. +// +// slice must have a type like []elementType. +func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return syscallPtr{}, errors.New("per-CPU value requires slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return syscallPtr{}, err + } + + sliceValue := reflect.ValueOf(slice) + sliceLen := sliceValue.Len() + if sliceLen > possibleCPUs { + return syscallPtr{}, errors.Errorf("per-CPU value exceeds number of CPUs") + } + + alignedElemLength := align(elemLength, 8) + buf := make([]byte, alignedElemLength*possibleCPUs) + + for i := 0; i < sliceLen; i++ { + elem := sliceValue.Index(i).Interface() + elemBytes, err := marshalBytes(elem, elemLength) + if err != nil { + return syscallPtr{}, err + } + + offset := i * alignedElemLength + copy(buf[offset:offset+elemLength], elemBytes) + } + + return newPtr(unsafe.Pointer(&buf[0])), nil +} + +// unmarshalPerCPUValue decodes a buffer into a slice containing one value per +// possible CPU. +// +// valueOut must have a type like *[]elementType +func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error { + slicePtrType := reflect.TypeOf(slicePtr) + if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice { + return errors.Errorf("per-cpu value requires pointer to slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return err + } + + sliceType := slicePtrType.Elem() + slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs) + + sliceElemType := sliceType.Elem() + sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr + if sliceElemIsPointer { + sliceElemType = sliceElemType.Elem() + } + + step := len(buf) / possibleCPUs + if step < elemLength { + return errors.Errorf("per-cpu element length is larger than available data") + } + for i := 0; i < possibleCPUs; i++ { + var elem interface{} + if sliceElemIsPointer { + newElem := reflect.New(sliceElemType) + slice.Index(i).Set(newElem) + elem = newElem.Interface() + } else { + elem = slice.Index(i).Addr().Interface() + } + + // Make a copy, since unmarshal can hold on to itemBytes + elemBytes := make([]byte, elemLength) + copy(elemBytes, buf[:elemLength]) + + err := unmarshalBytes(elem, elemBytes) + if err != nil { + return errors.Wrapf(err, "cpu %d", i) + } + + buf = buf[step:] + } + + reflect.ValueOf(slicePtr).Elem().Set(slice) + return nil +} + +func align(n, alignment int) int { + return (int(n) + alignment - 1) / alignment * alignment +} diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go new file mode 100644 index 0000000..03b24fb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -0,0 +1,523 @@ +package ebpf + +import ( + "bytes" + "fmt" + "math" + "path/filepath" + "strings" + "time" + "unsafe" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +var ( + errNotSupported = errors.New("ebpf: not supported by kernel") +) + +const ( + // Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN. + // This is currently the maximum of spare space allocated for SKB + // and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN. + outputPad = 256 + 2 +) + +// DefaultVerifierLogSize is the default number of bytes allocated for the +// verifier log. +const DefaultVerifierLogSize = 64 * 1024 + +// ProgramOptions control loading a program into the kernel. +type ProgramOptions struct { + // Controls the detail emitted by the kernel verifier. Set to non-zero + // to enable logging. + LogLevel uint32 + // Controls the output buffer size for the verifier. Defaults to + // DefaultVerifierLogSize. + LogSize int +} + +// ProgramSpec defines a Program +type ProgramSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + Type ProgramType + AttachType AttachType + Instructions asm.Instructions + License string + KernelVersion uint32 +} + +// Copy returns a copy of the spec. +func (ps *ProgramSpec) Copy() *ProgramSpec { + if ps == nil { + return nil + } + + cpy := *ps + cpy.Instructions = make(asm.Instructions, len(ps.Instructions)) + copy(cpy.Instructions, ps.Instructions) + return &cpy +} + +// Program represents BPF program loaded into the kernel. +// +// It is not safe to close a Program which is used by other goroutines. +type Program struct { + // Contains the output of the kernel verifier if enabled, + // otherwise it is empty. + VerifierLog string + + fd *bpfFD + name string + abi ProgramABI +} + +// NewProgram creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. +func NewProgram(spec *ProgramSpec) (*Program, error) { + return NewProgramWithOptions(spec, ProgramOptions{}) +} + +// NewProgramWithOptions creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. +func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { + attr, err := convertProgramSpec(spec, haveObjName.Result()) + if err != nil { + return nil, err + } + + logSize := DefaultVerifierLogSize + if opts.LogSize > 0 { + logSize = opts.LogSize + } + + var logBuf []byte + if opts.LogLevel > 0 { + logBuf = make([]byte, logSize) + attr.logLevel = opts.LogLevel + attr.logSize = uint32(len(logBuf)) + attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + } + + fd, err := bpfProgLoad(attr) + if err == nil { + prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type}) + prog.VerifierLog = convertCString(logBuf) + return prog, nil + } + + truncated := errors.Cause(err) == unix.ENOSPC + if opts.LogLevel == 0 { + // Re-run with the verifier enabled to get better error messages. + logBuf = make([]byte, logSize) + attr.logLevel = 1 + attr.logSize = uint32(len(logBuf)) + attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + + _, nerr := bpfProgLoad(attr) + truncated = errors.Cause(nerr) == unix.ENOSPC + } + + logs := convertCString(logBuf) + if truncated { + logs += "\n(truncated...)" + } + + return nil, &loadError{err, logs} +} + +// NewProgramFromFD creates a program from a raw fd. +// +// You should not use fd after calling this function. +func NewProgramFromFD(fd int) (*Program, error) { + if fd < 0 { + return nil, errors.New("invalid fd") + } + bpfFd := newBPFFD(uint32(fd)) + + info, err := bpfGetProgInfoByFD(bpfFd) + if err != nil { + bpfFd.forget() + return nil, err + } + + var name string + if bpfName := convertCString(info.name[:]); bpfName != "" { + name = bpfName + } else { + name = convertCString(info.tag[:]) + } + + return newProgram(bpfFd, name, newProgramABIFromInfo(info)), nil +} + +func newProgram(fd *bpfFD, name string, abi *ProgramABI) *Program { + return &Program{ + name: name, + fd: fd, + abi: *abi, + } +} + +func convertProgramSpec(spec *ProgramSpec, includeName bool) (*bpfProgLoadAttr, error) { + if len(spec.Instructions) == 0 { + return nil, errors.New("Instructions cannot be empty") + } + + if len(spec.License) == 0 { + return nil, errors.New("License cannot be empty") + } + + buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize)) + err := spec.Instructions.Marshal(buf, internal.NativeEndian) + if err != nil { + return nil, err + } + + bytecode := buf.Bytes() + insCount := uint32(len(bytecode) / asm.InstructionSize) + lic := []byte(spec.License) + attr := &bpfProgLoadAttr{ + progType: spec.Type, + expectedAttachType: spec.AttachType, + insCount: insCount, + instructions: newPtr(unsafe.Pointer(&bytecode[0])), + license: newPtr(unsafe.Pointer(&lic[0])), + } + + name, err := newBPFObjName(spec.Name) + if err != nil { + return nil, err + } + + if includeName { + attr.progName = name + } + + return attr, nil +} + +func (p *Program) String() string { + if p.name != "" { + return fmt.Sprintf("%s(%s)#%s", p.abi.Type, p.name, p.fd) + } + return fmt.Sprintf("%s#%s", p.abi.Type, p.fd) +} + +// ABI gets the ABI of the Program +func (p *Program) ABI() ProgramABI { + return p.abi +} + +// FD gets the file descriptor of the Program. +// +// It is invalid to call this function after Close has been called. +func (p *Program) FD() int { + fd, err := p.fd.value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. + return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Program. +// +// Closing the duplicate does not affect the original, and vice versa. +// +// Cloning a nil Program returns nil. +func (p *Program) Clone() (*Program, error) { + if p == nil { + return nil, nil + } + + dup, err := p.fd.dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone program") + } + + return newProgram(dup, p.name, &p.abi), nil +} + +// Pin persists the Program past the lifetime of the process that created it +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (p *Program) Pin(fileName string) error { + return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program") +} + +// Close unloads the program from the kernel. +func (p *Program) Close() error { + if p == nil { + return nil + } + + return p.fd.close() +} + +// Test runs the Program in the kernel with the given input and returns the +// value returned by the eBPF program. outLen may be zero. +// +// Note: the kernel expects at least 14 bytes input for an ethernet header for +// XDP and SKB programs. +// +// This function requires at least Linux 4.12. +func (p *Program) Test(in []byte) (uint32, []byte, error) { + ret, out, _, err := p.testRun(in, 1) + return ret, out, err +} + +// Benchmark runs the Program with the given input for a number of times +// and returns the time taken per iteration. +// +// The returned value is the return value of the last execution of +// the program. +// +// This function requires at least Linux 4.12. +func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) { + ret, _, total, err := p.testRun(in, repeat) + return ret, total, err +} + +var noProgTestRun = featureTest{ + Fn: func() bool { + prog, err := NewProgram(&ProgramSpec{ + Type: SocketFilter, + Instructions: asm.Instructions{ + asm.LoadImm(asm.R0, 0, asm.DWord), + asm.Return(), + }, + License: "MIT", + }) + if err != nil { + // This may be because we lack sufficient permissions, etc. + return false + } + defer prog.Close() + + fd, err := prog.fd.value() + if err != nil { + return false + } + + // Programs require at least 14 bytes input + in := make([]byte, 14) + attr := bpfProgTestRunAttr{ + fd: fd, + dataSizeIn: uint32(len(in)), + dataIn: newPtr(unsafe.Pointer(&in[0])), + } + + _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return errors.Cause(err) == unix.EINVAL + }, +} + +func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) { + if uint(repeat) > math.MaxUint32 { + return 0, nil, 0, fmt.Errorf("repeat is too high") + } + + if len(in) == 0 { + return 0, nil, 0, fmt.Errorf("missing input") + } + + if uint(len(in)) > math.MaxUint32 { + return 0, nil, 0, fmt.Errorf("input is too long") + } + + if noProgTestRun.Result() { + return 0, nil, 0, errNotSupported + } + + // Older kernels ignore the dataSizeOut argument when copying to user space. + // Combined with things like bpf_xdp_adjust_head() we don't really know what the final + // size will be. Hence we allocate an output buffer which we hope will always be large + // enough, and panic if the kernel wrote past the end of the allocation. + // See https://patchwork.ozlabs.org/cover/1006822/ + out := make([]byte, len(in)+outputPad) + + fd, err := p.fd.value() + if err != nil { + return 0, nil, 0, err + } + + attr := bpfProgTestRunAttr{ + fd: fd, + dataSizeIn: uint32(len(in)), + dataSizeOut: uint32(len(out)), + dataIn: newPtr(unsafe.Pointer(&in[0])), + dataOut: newPtr(unsafe.Pointer(&out[0])), + repeat: uint32(repeat), + } + + _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return 0, nil, 0, errors.Wrap(err, "can't run test") + } + + if int(attr.dataSizeOut) > cap(out) { + // Houston, we have a problem. The program created more data than we allocated, + // and the kernel wrote past the end of our buffer. + panic("kernel wrote past end of output buffer") + } + out = out[:int(attr.dataSizeOut)] + + total := time.Duration(attr.duration) * time.Nanosecond + return attr.retval, out, total, nil +} + +func unmarshalProgram(buf []byte) (*Program, error) { + if len(buf) != 4 { + return nil, errors.New("program id requires 4 byte value") + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + id := internal.NativeEndian.Uint32(buf) + fd, err := bpfGetProgramFDByID(id) + if err != nil { + return nil, err + } + + abi, err := newProgramABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + + return newProgram(fd, "", abi), nil +} + +// MarshalBinary implements BinaryMarshaler. +func (p *Program) MarshalBinary() ([]byte, error) { + value, err := p.fd.value() + if err != nil { + return nil, err + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, value) + return buf, nil +} + +// Attach a Program to a container object fd +func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error { + if fd < 0 { + return errors.New("invalid fd") + } + + pfd, err := p.fd.value() + if err != nil { + return err + } + + attr := bpfProgAlterAttr{ + targetFd: uint32(fd), + attachBpfFd: pfd, + attachType: uint32(typ), + attachFlags: uint32(flags), + } + + return bpfProgAlter(_ProgAttach, &attr) +} + +// Detach a Program from a container object fd +func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error { + if fd < 0 { + return errors.New("invalid fd") + } + + pfd, err := p.fd.value() + if err != nil { + return err + } + + attr := bpfProgAlterAttr{ + targetFd: uint32(fd), + attachBpfFd: pfd, + attachType: uint32(typ), + attachFlags: uint32(flags), + } + + return bpfProgAlter(_ProgDetach, &attr) +} + +// LoadPinnedProgram loads a Program from a BPF file. +// +// Requires at least Linux 4.13, use LoadPinnedProgramExplicit on +// earlier versions. +func LoadPinnedProgram(fileName string) (*Program, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + + abi, err := newProgramABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + + return newProgram(fd, filepath.Base(fileName), abi), nil +} + +// LoadPinnedProgramExplicit loads a program with explicit parameters. +func LoadPinnedProgramExplicit(fileName string, abi *ProgramABI) (*Program, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + + return newProgram(fd, filepath.Base(fileName), abi), nil +} + +// SanitizeName replaces all invalid characters in name. +// +// Use this to automatically generate valid names for maps and +// programs at run time. +// +// Passing a negative value for replacement will delete characters +// instead of replacing them. +func SanitizeName(name string, replacement rune) string { + return strings.Map(func(char rune) rune { + if invalidBPFObjNameChar(char) { + return replacement + } + return char + }, name) +} + +type loadError struct { + cause error + verifierLog string +} + +func (le *loadError) Error() string { + if le.verifierLog == "" { + return fmt.Sprintf("failed to load program: %s", le.cause) + } + return fmt.Sprintf("failed to load program: %s: %s", le.cause, le.verifierLog) +} + +func (le *loadError) Cause() error { + return le.cause +} + +// IsNotSupported returns true if an error occurred because +// the kernel does not have support for a specific feature. +func IsNotSupported(err error) bool { + return errors.Cause(err) == errNotSupported +} diff --git a/vendor/github.com/cilium/ebpf/ptr_32_be.go b/vendor/github.com/cilium/ebpf/ptr_32_be.go new file mode 100644 index 0000000..7757744 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_32_be.go @@ -0,0 +1,14 @@ +// +build armbe mips mips64p32 + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type syscallPtr struct { + pad uint32 + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/ptr_32_le.go b/vendor/github.com/cilium/ebpf/ptr_32_le.go new file mode 100644 index 0000000..14b805e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_32_le.go @@ -0,0 +1,14 @@ +// +build 386 amd64p32 arm mipsle mips64p32le + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type syscallPtr struct { + ptr unsafe.Pointer + pad uint32 +} diff --git a/vendor/github.com/cilium/ebpf/ptr_64.go b/vendor/github.com/cilium/ebpf/ptr_64.go new file mode 100644 index 0000000..c897d72 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_64.go @@ -0,0 +1,14 @@ +// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le +// +build !armbe,!mips,!mips64p32 + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type syscallPtr struct { + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/readme.md b/vendor/github.com/cilium/ebpf/readme.md new file mode 100644 index 0000000..26ab2b9 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/readme.md @@ -0,0 +1,20 @@ +eBPF +------- +[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf) + +eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long running processes. + +[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler. + +The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack. + +## Current status + +The package is production ready, but **the API is explicitly unstable +right now**. Expect to update your code if you want to follow along. + +## Useful resources + +* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended) +* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt) +* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md) diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go new file mode 100644 index 0000000..68abd3b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/syscalls.go @@ -0,0 +1,420 @@ +package ebpf + +import ( + "bytes" + "path/filepath" + "runtime" + "strconv" + "strings" + "unsafe" + + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +var errClosedFd = errors.New("use of closed file descriptor") + +type bpfFD struct { + raw int64 +} + +func newBPFFD(value uint32) *bpfFD { + fd := &bpfFD{int64(value)} + runtime.SetFinalizer(fd, (*bpfFD).close) + return fd +} + +func (fd *bpfFD) String() string { + return strconv.FormatInt(fd.raw, 10) +} + +func (fd *bpfFD) value() (uint32, error) { + if fd.raw < 0 { + return 0, errClosedFd + } + + return uint32(fd.raw), nil +} + +func (fd *bpfFD) close() error { + if fd.raw < 0 { + return nil + } + + value := int(fd.raw) + fd.raw = -1 + + fd.forget() + return unix.Close(value) +} + +func (fd *bpfFD) forget() { + runtime.SetFinalizer(fd, nil) +} + +func (fd *bpfFD) dup() (*bpfFD, error) { + if fd.raw < 0 { + return nil, errClosedFd + } + + dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, errors.Wrap(err, "can't dup fd") + } + + return newBPFFD(uint32(dup)), nil +} + +// bpfObjName is a null-terminated string made up of +// 'A-Za-z0-9_' characters. +type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte + +// newBPFObjName truncates the result if it is too long. +func newBPFObjName(name string) (bpfObjName, error) { + idx := strings.IndexFunc(name, invalidBPFObjNameChar) + if idx != -1 { + return bpfObjName{}, errors.Errorf("invalid character '%c' in name '%s'", name[idx], name) + } + + var result bpfObjName + copy(result[:unix.BPF_OBJ_NAME_LEN-1], name) + return result, nil +} + +func invalidBPFObjNameChar(char rune) bool { + switch { + case char >= 'A' && char <= 'Z': + fallthrough + case char >= 'a' && char <= 'z': + fallthrough + case char >= '0' && char <= '9': + fallthrough + case char == '_': + return false + default: + return true + } +} + +type bpfMapCreateAttr struct { + mapType MapType + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + innerMapFd uint32 // since 4.12 56f668dfe00d + numaNode uint32 // since 4.14 96eabe7a40aa + mapName bpfObjName // since 4.15 ad5b177bd73f +} + +type bpfMapOpAttr struct { + mapFd uint32 + padding uint32 + key syscallPtr + value syscallPtr + flags uint64 +} + +type bpfMapInfo struct { + mapType uint32 + id uint32 + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + mapName bpfObjName // since 4.15 ad5b177bd73f +} + +type bpfPinObjAttr struct { + fileName syscallPtr + fd uint32 + padding uint32 +} + +type bpfProgLoadAttr struct { + progType ProgramType + insCount uint32 + instructions syscallPtr + license syscallPtr + logLevel uint32 + logSize uint32 + logBuf syscallPtr + kernelVersion uint32 // since 4.1 2541517c32be + progFlags uint32 // since 4.11 e07b98d9bffe + progName bpfObjName // since 4.15 067cae47771c + progIfIndex uint32 // since 4.15 1f6f4cb7ba21 + expectedAttachType AttachType // since 4.17 5e43f899b03a +} + +type bpfProgInfo struct { + progType uint32 + id uint32 + tag [unix.BPF_TAG_SIZE]byte + jitedLen uint32 + xlatedLen uint32 + jited syscallPtr + xlated syscallPtr + loadTime uint64 // since 4.15 cb4d2b3f03d8 + createdByUID uint32 + nrMapIDs uint32 + mapIds syscallPtr + name bpfObjName +} + +type bpfProgTestRunAttr struct { + fd uint32 + retval uint32 + dataSizeIn uint32 + dataSizeOut uint32 + dataIn syscallPtr + dataOut syscallPtr + repeat uint32 + duration uint32 +} + +type bpfProgAlterAttr struct { + targetFd uint32 + attachBpfFd uint32 + attachType uint32 + attachFlags uint32 +} + +type bpfObjGetInfoByFDAttr struct { + fd uint32 + infoLen uint32 + info syscallPtr // May be either bpfMapInfo or bpfProgInfo +} + +type bpfGetFDByIDAttr struct { + id uint32 + next uint32 +} + +func newPtr(ptr unsafe.Pointer) syscallPtr { + return syscallPtr{ptr: ptr} +} + +func bpfProgLoad(attr *bpfProgLoadAttr) (*bpfFD, error) { + for { + fd, err := bpfCall(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + // As of ~4.20 the verifier can be interrupted by a signal, + // and returns EAGAIN in that case. + if err == unix.EAGAIN { + continue + } + + if err != nil { + return nil, err + } + + return newBPFFD(uint32(fd)), nil + } +} + +func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error { + _, err := bpfCall(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +func bpfMapCreate(attr *bpfMapCreateAttr) (*bpfFD, error) { + fd, err := bpfCall(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return newBPFFD(uint32(fd)), nil +} + +func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + } + _, err = bpfCall(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + flags: flags, + } + _, err = bpfCall(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + } + _, err = bpfCall(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: nextKeyOut, + } + _, err = bpfCall(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +const bpfFSType = 0xcafe4a11 + +func bpfPinObject(fileName string, fd *bpfFD) error { + dirName := filepath.Dir(fileName) + var statfs unix.Statfs_t + if err := unix.Statfs(dirName, &statfs); err != nil { + return err + } + if uint64(statfs.Type) != bpfFSType { + return errors.Errorf("%s is not on a bpf filesystem", fileName) + } + + value, err := fd.value() + if err != nil { + return err + } + + _, err = bpfCall(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{ + fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), + fd: value, + }), 16) + return errors.Wrapf(err, "pin object %s", fileName) +} + +func bpfGetObject(fileName string) (*bpfFD, error) { + ptr, err := bpfCall(_ObjGet, unsafe.Pointer(&bpfPinObjAttr{ + fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), + }), 16) + if err != nil { + return nil, errors.Wrapf(err, "get object %s", fileName) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfGetObjectInfoByFD(fd *bpfFD, info unsafe.Pointer, size uintptr) error { + value, err := fd.value() + if err != nil { + return err + } + + // available from 4.13 + attr := bpfObjGetInfoByFDAttr{ + fd: value, + infoLen: uint32(size), + info: newPtr(info), + } + _, err = bpfCall(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return errors.Wrapf(err, "fd %d", value) +} + +func bpfGetProgInfoByFD(fd *bpfFD) (*bpfProgInfo, error) { + var info bpfProgInfo + err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get program info") +} + +func bpfGetMapInfoByFD(fd *bpfFD) (*bpfMapInfo, error) { + var info bpfMapInfo + err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get map info:") +} + +var haveObjName = featureTest{ + Fn: func() bool { + name, err := newBPFObjName("feature_test") + if err != nil { + // This really is a fatal error, but it should be caught + // by the unit tests not working. + return false + } + + attr := bpfMapCreateAttr{ + mapType: Array, + keySize: 4, + valueSize: 4, + maxEntries: 1, + mapName: name, + } + + fd, err := bpfMapCreate(&attr) + if err != nil { + return false + } + + _ = fd.close() + return true + }, +} + +func bpfGetMapFDByID(id uint32) (*bpfFD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := bpfCall(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for map id %d", id) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfGetProgramFDByID(id uint32) (*bpfFD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := bpfCall(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for program id %d", id) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfCall(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) { + r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size) + runtime.KeepAlive(attr) + + var err error + if errNo != 0 { + err = errNo + } + + return r1, err +} + +func convertCString(in []byte) string { + inLen := bytes.IndexByte(in, 0) + if inLen == -1 { + return "" + } + return string(in[:inLen]) +} diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go new file mode 100644 index 0000000..0daf9a7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types.go @@ -0,0 +1,189 @@ +package ebpf + +//go:generate stringer -output types_string.go -type=MapType,ProgramType + +// MapType indicates the type map structure +// that will be initialized in the kernel. +type MapType uint32 + +// All the various map types that can be created +const ( + UnspecifiedMap MapType = iota + // Hash is a hash map + Hash + // Array is an array map + Array + // ProgramArray - A program array map is a special kind of array map whose map + // values contain only file descriptors referring to other eBPF + // programs. Thus, both the key_size and value_size must be + // exactly four bytes. This map is used in conjunction with the + // TailCall helper. + ProgramArray + // PerfEventArray - A perf event array is used in conjunction with PerfEventRead + // and PerfEventOutput calls, to read the raw bpf_perf_data from the registers. + PerfEventArray + // PerCPUHash - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + PerCPUHash + // PerCPUArray - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + // Each CPU gets a copy of this hash, the contents of all of which can be reconciled + // later. + PerCPUArray + // StackTrace - This holds whole user and kernel stack traces, it can be retrieved with + // GetStackID + StackTrace + // CGroupArray - This is a very niche structure used to help SKBInCGroup determine + // if an skb is from a socket belonging to a specific cgroup + CGroupArray + // LRUHash - This allows you to create a small hash structure that will purge the + // least recently used items rather than thow an error when you run out of memory + LRUHash + // LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs, + // it has more to do with including the CPU id with the LRU calculation so that if a + // particular CPU is using a value over-and-over again, then it will be saved, but if + // a value is being retrieved a lot but sparsely across CPUs it is not as important, basically + // giving weight to CPU locality over overall usage. + LRUCPUHash + // LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful, + // for storing things like IP addresses which can be bit masked allowing for keys of differing + // values to refer to the same reference based on their masks. See wikipedia for more details. + LPMTrie + // ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps + // itself. + ArrayOfMaps + // HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps + // itself. + HashOfMaps +) + +// hasPerCPUValue returns true if the Map stores a value per CPU. +func (mt MapType) hasPerCPUValue() bool { + if mt == PerCPUHash || mt == PerCPUArray { + return true + } + return false +} + +const ( + _MapCreate = iota + _MapLookupElem + _MapUpdateElem + _MapDeleteElem + _MapGetNextKey + _ProgLoad + _ObjPin + _ObjGet + _ProgAttach + _ProgDetach + _ProgTestRun + _ProgGetNextID + _MapGetNextID + _ProgGetFDByID + _MapGetFDByID + _ObjGetInfoByFD +) + +const ( + _Any = iota + _NoExist + _Exist +) + +// ProgramType of the eBPF program +type ProgramType uint32 + +// eBPF program types +const ( + // Unrecognized program type + UnspecifiedProgram ProgramType = iota + // SocketFilter socket or seccomp filter + SocketFilter + // Kprobe program + Kprobe + // SchedCLS traffic control shaper + SchedCLS + // SchedACT routing control shaper + SchedACT + // TracePoint program + TracePoint + // XDP program + XDP + // PerfEvent program + PerfEvent + // CGroupSKB program + CGroupSKB + // CGroupSock program + CGroupSock + // LWTIn program + LWTIn + // LWTOut program + LWTOut + // LWTXmit program + LWTXmit + // SockOps program + SockOps + // SkSKB program + SkSKB + // CGroupDevice program + CGroupDevice + // SkMsg program + SkMsg + // RawTracepoint program + RawTracepoint + // CGroupSockAddr program + CGroupSockAddr + // LWTSeg6Local program + LWTSeg6Local + // LircMode2 program + LircMode2 + // SkReuseport program + SkReuseport + // FlowDissector program + FlowDissector + // CGroupSysctl program + CGroupSysctl + // RawTracepointWritable program + RawTracepointWritable + // CGroupSockopt program + CGroupSockopt +) + +// AttachType of the eBPF program, needed to differentiate allowed context accesses in +// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required. +// Will cause invalid argument (EINVAL) at program load time if set incorrectly. +type AttachType uint32 + +// AttachNone is an alias for AttachCGroupInetIngress for readability reasons +const AttachNone AttachType = 0 + +const ( + AttachCGroupInetIngress AttachType = iota + AttachCGroupInetEgress + AttachCGroupInetSockCreate + AttachCGroupSockOps + AttachSkSKBStreamParser + AttachSkSKBStreamVerdict + AttachCGroupDevice + AttachSkMsgVerdict + AttachCGroupInet4Bind + AttachCGroupInet6Bind + AttachCGroupInet4Connect + AttachCGroupInet6Connect + AttachCGroupInet4PostBind + AttachCGroupInet6PostBind + AttachCGroupUDP4Sendmsg + AttachCGroupUDP6Sendmsg + AttachLircMode2 + AttachFlowDissector + AttachCGroupSysctl + AttachCGroupUDP4Recvmsg + AttachCGroupUDP6Recvmsg + AttachCGroupGetsockopt + AttachCGroupSetsockopt +) + +// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command +type AttachFlags uint32 diff --git a/vendor/github.com/cilium/ebpf/types_string.go b/vendor/github.com/cilium/ebpf/types_string.go new file mode 100644 index 0000000..4813437 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types_string.go @@ -0,0 +1,78 @@ +// Code generated by "stringer -output types_string.go -type=MapType,ProgramType"; DO NOT EDIT. + +package ebpf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedMap-0] + _ = x[Hash-1] + _ = x[Array-2] + _ = x[ProgramArray-3] + _ = x[PerfEventArray-4] + _ = x[PerCPUHash-5] + _ = x[PerCPUArray-6] + _ = x[StackTrace-7] + _ = x[CGroupArray-8] + _ = x[LRUHash-9] + _ = x[LRUCPUHash-10] + _ = x[LPMTrie-11] + _ = x[ArrayOfMaps-12] + _ = x[HashOfMaps-13] +} + +const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps" + +var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136} + +func (i MapType) String() string { + if i >= MapType(len(_MapType_index)-1) { + return "MapType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _MapType_name[_MapType_index[i]:_MapType_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedProgram-0] + _ = x[SocketFilter-1] + _ = x[Kprobe-2] + _ = x[SchedCLS-3] + _ = x[SchedACT-4] + _ = x[TracePoint-5] + _ = x[XDP-6] + _ = x[PerfEvent-7] + _ = x[CGroupSKB-8] + _ = x[CGroupSock-9] + _ = x[LWTIn-10] + _ = x[LWTOut-11] + _ = x[LWTXmit-12] + _ = x[SockOps-13] + _ = x[SkSKB-14] + _ = x[CGroupDevice-15] + _ = x[SkMsg-16] + _ = x[RawTracepoint-17] + _ = x[CGroupSockAddr-18] + _ = x[LWTSeg6Local-19] + _ = x[LircMode2-20] + _ = x[SkReuseport-21] + _ = x[FlowDissector-22] + _ = x[CGroupSysctl-23] + _ = x[RawTracepointWritable-24] + _ = x[CGroupSockopt-25] +} + +const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt" + +var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258} + +func (i ProgramType) String() string { + if i >= ProgramType(len(_ProgramType_index)-1) { + return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]] +}