--- /dev/null
+vendor/pkg
+/runc
+/runc-*
+contrib/cmd/recvtty/recvtty
+man/man8
+release
--- /dev/null
+approve_by_comment: true
+approve_regex: ^LGTM
+reject_regex: ^Rejected
+reset_on_push: true
+author_approval: ignored
+reviewers:
+ teams:
+ - runc-maintainers
+ name: default
+ required: 2
--- /dev/null
+language: go
+go:
+ - 1.9.x
+ - 1.10.x
+ - 1.11.x
+ - tip
+
+matrix:
+ allow_failures:
+ - go: tip
+
+go_import_path: github.com/opencontainers/runc
+
+# `make ci` uses Docker.
+sudo: required
+services:
+ - docker
+
+env:
+ global:
+ - BUILDTAGS="seccomp apparmor selinux ambient"
+
+before_install:
+ - echo "deb http://archive.ubuntu.com/ubuntu trusty-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list
+ - sudo apt-get -qq update
+ - sudo apt-get install -y libseccomp-dev/trusty-backports
+ - go get -u golang.org/x/lint/golint
+ - go get -u github.com/vbatts/git-validation
+ - env | grep TRAVIS_
+
+script:
+ - git-validation -run DCO,short-subject -v
+ - make BUILDTAGS="${BUILDTAGS}"
+ - make BUILDTAGS="${BUILDTAGS}" clean ci cross
--- /dev/null
+## Contribution Guidelines
+
+### Security issues
+
+If you are reporting a security issue, do not create an issue or file a pull
+request on GitHub. Instead, disclose the issue responsibly by sending an email
+to security@opencontainers.org (which is inhabited only by the maintainers of
+the various OCI projects).
+
+### Pull requests are always welcome
+
+We are always thrilled to receive pull requests, and do our best to
+process them as fast as possible. Not sure if that typo is worth a pull
+request? Do it! We will appreciate it.
+
+If your pull request is not accepted on the first try, don't be
+discouraged! If there's a problem with the implementation, hopefully you
+received feedback on what to improve.
+
+We're trying very hard to keep runc lean and focused. We don't want it
+to do everything for everybody. This means that we might decide against
+incorporating a new feature. However, there might be a way to implement
+that feature *on top of* runc.
+
+
+### Conventions
+
+Fork the repo and make changes on your fork in a feature branch:
+
+- If it's a bugfix branch, name it XXX-something where XXX is the number of the
+ issue
+- If it's a feature branch, create an enhancement issue to announce your
+ intentions, and name it XXX-something where XXX is the number of the issue.
+
+Submit unit tests for your changes. Go has a great test framework built in; use
+it! Take a look at existing tests for inspiration. Run the full test suite on
+your branch before submitting a pull request.
+
+Update the documentation when creating or modifying features. Test
+your documentation changes for clarity, concision, and correctness, as
+well as a clean documentation build. See ``docs/README.md`` for more
+information on building the docs and how docs get released.
+
+Write clean code. Universally formatted code promotes ease of writing, reading,
+and maintenance. Always run `gofmt -s -w file.go` on each changed file before
+committing your changes. Most editors have plugins that do this automatically.
+
+Pull requests descriptions should be as clear as possible and include a
+reference to all the issues that they address.
+
+Pull requests must not contain commits from other users or branches.
+
+Commit messages must start with a capitalized and short summary (max. 50
+chars) written in the imperative, followed by an optional, more detailed
+explanatory text which is separated from the summary by an empty line.
+
+Code review comments may be added to your pull request. Discuss, then make the
+suggested modifications and push additional commits to your feature branch. Be
+sure to post a comment after pushing. The new commits will show up in the pull
+request automatically, but the reviewers will not be notified unless you
+comment.
+
+Before the pull request is merged, make sure that you squash your commits into
+logical units of work using `git rebase -i` and `git push -f`. After every
+commit the test suite should be passing. Include documentation changes in the
+same commit so that a revert would remove all traces of the feature or fix.
+
+Commits that fix or close an issue should include a reference like `Closes #XXX`
+or `Fixes #XXX`, which will automatically close the issue when merged.
+
+### Sign your work
+
+The sign-off is a simple line at the end of the explanation for the
+patch, which certifies that you wrote it or otherwise have the right to
+pass it on as an open-source patch. The rules are pretty simple: if you
+can certify the below (from
+[developercertificate.org](http://developercertificate.org/)):
+
+```
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+660 York Street, Suite 102,
+San Francisco, CA 94110 USA
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+```
+
+then you just add a line to every git commit message:
+
+ Signed-off-by: Joe Smith <joe@gmail.com>
+
+using your real name (sorry, no pseudonyms or anonymous contributions).
+
+You can add the sign off when creating the git commit via `git commit -s`.
--- /dev/null
+FROM golang:1.10-stretch
+
+RUN dpkg --add-architecture armel \
+ && dpkg --add-architecture armhf \
+ && dpkg --add-architecture arm64 \
+ && dpkg --add-architecture ppc64el \
+ && apt-get update && apt-get install -y \
+ build-essential \
+ curl \
+ sudo \
+ gawk \
+ iptables \
+ jq \
+ pkg-config \
+ libaio-dev \
+ libcap-dev \
+ libprotobuf-dev \
+ libprotobuf-c0-dev \
+ libnl-3-dev \
+ libnet-dev \
+ libseccomp2 \
+ libseccomp-dev \
+ protobuf-c-compiler \
+ protobuf-compiler \
+ python-minimal \
+ uidmap \
+ kmod \
+ crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \
+ libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \
+ --no-install-recommends \
+ && apt-get clean
+
+# Add a dummy user for the rootless integration tests. While runC does
+# not require an entry in /etc/passwd to operate, one of the tests uses
+# `git clone` -- and `git clone` does not allow you to clone a
+# repository if the current uid does not have an entry in /etc/passwd.
+RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless
+
+# install bats
+RUN cd /tmp \
+ && git clone https://github.com/sstephenson/bats.git \
+ && cd bats \
+ && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \
+ && ./install.sh /usr/local \
+ && rm -rf /tmp/bats
+
+# install criu
+ENV CRIU_VERSION v3.11
+RUN mkdir -p /usr/src/criu \
+ && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
+ && cd /usr/src/criu \
+ && make install-criu \
+ && rm -rf /usr/src/criu
+
+# setup a playground for us to spawn containers in
+ENV ROOTFS /busybox
+RUN mkdir -p ${ROOTFS}
+
+COPY script/tmpmount /
+WORKDIR /go/src/github.com/opencontainers/runc
+ENTRYPOINT ["/tmpmount"]
+
+ADD . /go/src/github.com/opencontainers/runc
+
+RUN . tests/integration/multi-arch.bash \
+ && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS}
--- /dev/null
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ Copyright 2014 Docker, Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
--- /dev/null
+Michael Crosby <michael@docker.com> (@crosbymichael)
+Rohit Jnagal <jnagal@google.com> (@rjnagal)
+Victor Marmol <vmarmol@google.com> (@vmarmol)
+Mrunal Patel <mpatel@redhat.com> (@mrunalp)
+Daniel, Dao Quang Minh <dqminh89@gmail.com> (@dqminh)
+Qiang Huang <h.huangqiang@huawei.com> (@hqhq)
+Aleksa Sarai <asarai@suse.de> (@cyphar)
--- /dev/null
+## Introduction
+
+Dear maintainer. Thank you for investing the time and energy to help
+make runc as useful as possible. Maintaining a project is difficult,
+sometimes unrewarding work. Sure, you will get to contribute cool
+features to the project. But most of your time will be spent reviewing,
+cleaning up, documenting, answering questions, justifying design
+decisions - while everyone has all the fun! But remember - the quality
+of the maintainers' work is what distinguishes the good projects from the
+great. So please be proud of your work, even the unglamorous parts,
+and encourage a culture of appreciation and respect for *every* aspect
+of improving the project - not just the hot new features.
+
+This document is a manual for maintainers old and new. It explains what
+is expected of maintainers, how they should work, and what tools are
+available to them.
+
+This is a living document - if you see something out of date or missing,
+speak up!
+
+## What are a maintainer's responsibilities?
+
+It is every maintainer's responsibility to:
+
+* 1) Expose a clear roadmap for improving their component.
+* 2) Deliver prompt feedback and decisions on pull requests.
+* 3) Be available to anyone with questions, bug reports, criticism etc.
+ on their component. This includes IRC and GitHub issues and pull requests.
+* 4) Make sure their component respects the philosophy, design and
+ roadmap of the project.
+
+## How are decisions made?
+
+Short answer: with pull requests to the runc repository.
+
+runc is an open-source project with an open design philosophy. This
+means that the repository is the source of truth for EVERY aspect of the
+project, including its philosophy, design, roadmap and APIs. *If it's
+part of the project, it's in the repo. It's in the repo, it's part of
+the project.*
+
+As a result, all decisions can be expressed as changes to the
+repository. An implementation change is a change to the source code. An
+API change is a change to the API specification. A philosophy change is
+a change to the philosophy manifesto. And so on.
+
+All decisions affecting runc, big and small, follow the same 3 steps:
+
+* Step 1: Open a pull request. Anyone can do this.
+
+* Step 2: Discuss the pull request. Anyone can do this.
+
+* Step 3: Accept (`LGTM`) or refuse a pull request. The relevant maintainers do
+this (see below "Who decides what?")
+
+*I'm a maintainer, should I make pull requests too?*
+
+Yes. Nobody should ever push to master directly. All changes should be
+made through a pull request.
+
+## Who decides what?
+
+All decisions are pull requests, and the relevant maintainers make
+decisions by accepting or refusing the pull request. Review and acceptance
+by anyone is denoted by adding a comment in the pull request: `LGTM`.
+However, only currently listed `MAINTAINERS` are counted towards the required
+two LGTMs.
+
+Overall the maintainer system works because of mutual respect across the
+maintainers of the project. The maintainers trust one another to make decisions
+in the best interests of the project. Sometimes maintainers can disagree and
+this is part of a healthy project to represent the points of view of various people.
+In the case where maintainers cannot find agreement on a specific change the
+role of a Chief Maintainer comes into play.
+
+The Chief Maintainer for the project is responsible for overall architecture
+of the project to maintain conceptual integrity. Large decisions and
+architecture changes should be reviewed by the chief maintainer.
+The current chief maintainer for the project is Michael Crosby (@crosbymichael).
+
+Even though the maintainer system is built on trust, if there is a conflict
+with the chief maintainer on a decision, their decision can be challenged
+and brought to the technical oversight board if two-thirds of the
+maintainers vote for an appeal. It is expected that this would be a
+very exceptional event.
+
+
+### How are maintainers added?
+
+The best maintainers have a vested interest in the project. Maintainers
+are first and foremost contributors that have shown they are committed to
+the long term success of the project. Contributors wanting to become
+maintainers are expected to be deeply involved in contributing code,
+pull request review, and triage of issues in the project for more than two months.
+
+Just contributing does not make you a maintainer, it is about building trust
+with the current maintainers of the project and being a person that they can
+depend on and trust to make decisions in the best interest of the project. The
+final vote to add a new maintainer should be approved by over 66% of the current
+maintainers with the chief maintainer having veto power. In case of a veto,
+conflict resolution rules expressed above apply. The voting period is
+five business days on the Pull Request to add the new maintainer.
+
+
+### What is expected of maintainers?
+
+Part of a healthy project is to have active maintainers to support the community
+in contributions and perform tasks to keep the project running. Maintainers are
+expected to be able to respond in a timely manner if their help is required on specific
+issues where they are pinged. Being a maintainer is a time consuming commitment and should
+not be taken lightly.
+
+When a maintainer is unable to perform the required duties they can be removed with
+a vote by 66% of the current maintainers with the chief maintainer having veto power.
+The voting period is ten business days. Issues related to a maintainer's performance should
+be discussed with them among the other maintainers so that they are not surprised by
+a pull request removing them.
+
+
+
--- /dev/null
+.PHONY: all shell dbuild man release \
+ localtest localunittest localintegration \
+ test unittest integration \
+ cross localcross
+
+GO := go
+
+SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
+PREFIX := $(DESTDIR)/usr/local
+BINDIR := $(PREFIX)/sbin
+GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
+RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
+PROJECT := github.com/opencontainers/runc
+BUILDTAGS ?= seccomp
+COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true)
+COMMIT := $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}")
+
+MAN_DIR := $(CURDIR)/man/man8
+MAN_PAGES = $(shell ls $(MAN_DIR)/*.8)
+MAN_PAGES_BASE = $(notdir $(MAN_PAGES))
+MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/
+
+RELEASE_DIR := $(CURDIR)/release
+
+VERSION := ${shell cat ./VERSION}
+
+SHELL := $(shell command -v bash 2>/dev/null)
+
+.DEFAULT: runc
+
+runc: $(SOURCES)
+ $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc .
+
+all: runc recvtty
+
+recvtty: contrib/cmd/recvtty/recvtty
+
+contrib/cmd/recvtty/recvtty: $(SOURCES)
+ $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+static: $(SOURCES)
+ CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo static_build" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc .
+ CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo static_build" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+release:
+ script/release.sh -r release/$(VERSION) -v $(VERSION)
+
+dbuild: runcimage
+ docker run ${DOCKER_RUN_PROXY} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all
+
+lint:
+ $(GO) vet $(allpackages)
+ $(GO) fmt $(allpackages)
+
+man:
+ man/md2man-all.sh
+
+runcimage:
+ docker build ${DOCKER_BUILD_PROXY} -t $(RUNC_IMAGE) .
+
+test:
+ make unittest integration rootlessintegration
+
+localtest:
+ make localunittest localintegration localrootlessintegration
+
+unittest: runcimage
+ docker run ${DOCKER_RUN_PROXY} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS}
+
+localunittest: all
+ $(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages)
+
+integration: runcimage
+ docker run ${DOCKER_RUN_PROXY} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH}
+
+localintegration: all
+ bats -t tests/integration${TESTPATH}
+
+rootlessintegration: runcimage
+ docker run ${DOCKER_RUN_PROXY} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration
+
+localrootlessintegration: all
+ tests/rootless.sh
+
+shell: runcimage
+ docker run ${DOCKER_RUN_PROXY} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash
+
+install:
+ install -D -m0755 runc $(BINDIR)/runc
+
+install-bash:
+ install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc
+
+install-man:
+ install -d -m 755 $(MAN_INSTALL_PATH)
+ install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH)
+
+uninstall:
+ rm -f $(BINDIR)/runc
+
+uninstall-bash:
+ rm -f $(PREFIX)/share/bash-completion/completions/runc
+
+uninstall-man:
+ rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE))
+
+clean:
+ rm -f runc runc-*
+ rm -f contrib/cmd/recvtty/recvtty
+ rm -rf $(RELEASE_DIR)
+ rm -rf $(MAN_DIR)
+
+validate:
+ script/validate-gofmt
+ script/validate-c
+ $(GO) vet $(allpackages)
+
+ci: validate test release
+
+cross: runcimage
+ docker run ${DOCKER_RUN_PROXY} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross
+
+localcross:
+ CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel .
+ CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf .
+ CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 .
+ CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le .
+
+# memoize allpackages, so that it's executed only once and only if used
+_allpackages = $(shell $(GO) list ./... | grep -v vendor)
+allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
--- /dev/null
+runc
+
+Copyright 2012-2015 Docker, Inc.
+
+This product includes software developed at Docker, Inc. (http://www.docker.com).
+
+The following is courtesy of our legal counsel:
+
+
+Use and transfer of Docker may be subject to certain restrictions by the
+United States and other governments.
+It is your responsibility to ensure that your use and/or transfer does not
+violate applicable laws.
+
+For more information, please see http://www.bis.doc.gov
+
+See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
--- /dev/null
+# runc principles
+
+In the design and development of runc and libcontainer we try to follow these principles:
+
+(Work in progress)
+
+* Don't try to replace every tool. Instead, be an ingredient to improve them.
+* Less code is better.
+* Fewer components are better. Do you really need to add one more class?
+* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand.
+* Don't do later what you can do now. "//TODO: refactor" is not acceptable in new code.
+* When hesitating between two options, choose the one that is easier to reverse.
+* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later.
+* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable.
+* The fewer moving parts in a container, the better.
+* Don't merge it unless you document it.
+* Don't document it unless you can keep it up-to-date.
+* Don't merge it unless you test it!
+* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that.
--- /dev/null
+# runc
+
+[](https://travis-ci.org/opencontainers/runc)
+[](https://goreportcard.com/report/github.com/opencontainers/runc)
+[](https://godoc.org/github.com/opencontainers/runc)
+
+## Introduction
+
+`runc` is a CLI tool for spawning and running containers according to the OCI specification.
+
+## Releases
+
+`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository.
+We will try to make sure that `runc` and the OCI specification major versions stay in lockstep.
+This means that `runc` 1.0.0 should implement the 1.0 version of the specification.
+
+You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
+
+### Security
+
+If you wish to report a security issue, please disclose the issue responsibly
+to security@opencontainers.org.
+
+## Building
+
+`runc` currently supports the Linux platform with various architecture support.
+It must be built with Go version 1.6 or higher in order for some features to function properly.
+
+In order to enable seccomp support you will need to install `libseccomp` on your platform.
+> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
+
+Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make.
+
+```bash
+# create a 'github.com/opencontainers' directory in your GOPATH/src
+cd github.com/opencontainers
+git clone https://github.com/opencontainers/runc
+cd runc
+
+make
+sudo make install
+```
+
+You can also use `go get` to install to your `GOPATH`, assuming that you have a `github.com` parent folder already created under `src`:
+
+```bash
+go get github.com/opencontainers/runc
+cd $GOPATH/src/github.com/opencontainers/runc
+make
+sudo make install
+```
+
+`runc` will be installed to `/usr/local/sbin/runc` on your system.
+
+
+#### Build Tags
+
+`runc` supports optional build tags for compiling support of various features.
+To add build tags to the make option the `BUILDTAGS` variable must be set.
+
+```bash
+make BUILDTAGS='seccomp apparmor'
+```
+
+| Build Tag | Feature | Dependency |
+|-----------|------------------------------------|-------------|
+| seccomp | Syscall filtering | libseccomp |
+| selinux | selinux process and mount labeling | <none> |
+| apparmor | apparmor profile support | <none> |
+| ambient | ambient capability support | kernel 4.3 |
+| nokmem | disable kernel memory accounting | <none> |
+
+
+### Running the test suite
+
+`runc` currently supports running its test suite via Docker.
+To run the suite just type `make test`.
+
+```bash
+make test
+```
+
+There are additional make targets for running the tests outside of a container but this is not recommended as the tests are written with the expectation that they can write and remove anywhere.
+
+You can run a specific test case by setting the `TESTFLAGS` variable.
+
+```bash
+# make test TESTFLAGS="-run=SomeTestFunction"
+```
+
+You can run a specific integration test by setting the `TESTPATH` variable.
+
+```bash
+# make test TESTPATH="/checkpoint.bats"
+```
+
+You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
+
+```bash
+# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
+```
+
+### Dependencies Management
+
+`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependency management.
+Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or
+update dependencies.
+
+## Using runc
+
+### Creating an OCI Bundle
+
+In order to use runc you must have your container in the format of an OCI bundle.
+If you have Docker installed you can use its `export` method to acquire a root filesystem from an existing Docker container.
+
+```bash
+# create the top most bundle directory
+mkdir /mycontainer
+cd /mycontainer
+
+# create the rootfs directory
+mkdir rootfs
+
+# export busybox via Docker into the rootfs directory
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+```
+
+After a root filesystem is populated you just generate a spec in the format of a `config.json` file inside your bundle.
+`runc` provides a `spec` command to generate a base template spec that you are then able to edit.
+To find features and documentation for fields in the spec please refer to the [specs](https://github.com/opencontainers/runtime-spec) repository.
+
+```bash
+runc spec
+```
+
+### Running Containers
+
+Assuming you have an OCI bundle from the previous step you can execute the container in two different ways.
+
+The first way is to use the convenience command `run` that will handle creating, starting, and deleting the container after it exits.
+
+```bash
+# run as root
+cd /mycontainer
+runc run mycontainerid
+```
+
+If you used the unmodified `runc spec` template this should give you a `sh` session inside the container.
+
+The second way to start a container is using the specs lifecycle operations.
+This gives you more power over how the container is created and managed while it is running.
+This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
+
+
+```json
+ "process": {
+ "terminal": false,
+ "user": {
+ "uid": 0,
+ "gid": 0
+ },
+ "args": [
+ "sleep", "5"
+ ],
+ "env": [
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm"
+ ],
+ "cwd": "/",
+ "capabilities": {
+ "bounding": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "effective": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "inheritable": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "permitted": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "ambient": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ]
+ },
+ "rlimits": [
+ {
+ "type": "RLIMIT_NOFILE",
+ "hard": 1024,
+ "soft": 1024
+ }
+ ],
+ "noNewPrivileges": true
+ },
+```
+
+Now we can go through the lifecycle operations in your shell.
+
+
+```bash
+# run as root
+cd /mycontainer
+runc create mycontainerid
+
+# view the container is created and in the "created" state
+runc list
+
+# start the process inside the container
+runc start mycontainerid
+
+# after 5 seconds view that the container has exited and is now in the stopped state
+runc list
+
+# now delete the container
+runc delete mycontainerid
+```
+
+This allows higher level systems to augment the containers creation logic with setup of various settings after the container is created and/or before it is deleted. For example, the container's network stack is commonly set up after `create` but before `start`.
+
+#### Rootless containers
+`runc` has the ability to run containers without root privileges. This is called `rootless`. You need to pass some parameters to `runc` in order to run rootless containers. See below and compare with the previous examples. Run the following commands as an ordinary user:
+```bash
+# Same as the first example
+mkdir ~/mycontainer
+cd ~/mycontainer
+mkdir rootfs
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+
+# The --rootless parameter instructs runc spec to generate a configuration for a rootless container, which will allow you to run the container as a non-root user.
+runc spec --rootless
+
+# The --root parameter tells runc where to store the container state. It must be writable by the user.
+runc --root /tmp/runc run mycontainerid
+```
+
+#### Supervisors
+
+`runc` can be used with process supervisors and init systems to ensure that containers are restarted when they exit.
+An example systemd unit file looks something like this.
+
+```systemd
+[Unit]
+Description=Start My Container
+
+[Service]
+Type=forking
+ExecStart=/usr/local/sbin/runc run -d --pid-file /run/mycontainerid.pid mycontainerid
+ExecStopPost=/usr/local/sbin/runc delete mycontainerid
+WorkingDirectory=/mycontainer
+PIDFile=/run/mycontainerid.pid
+
+[Install]
+WantedBy=multi-user.target
+```
+
+## License
+
+The code and docs are released under the [Apache 2.0 license](LICENSE).
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+
+ "golang.org/x/sys/unix"
+)
+
+// checkpointCommand implements "runc checkpoint": it asks criu (via
+// libcontainer) to dump the state of a running container to disk so that it
+// can later be restored with "runc restore".
+var checkpointCommand = cli.Command{
+	Name:  "checkpoint",
+	Usage: "checkpoint a running container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+checkpointed.`,
+	Description: `The checkpoint command saves the state of the container instance.`,
+	Flags: []cli.Flag{
+		cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"},
+		cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"},
+		cli.StringFlag{Name: "parent-path", Value: "", Usage: "path for previous criu image files in pre-dump"},
+		cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"},
+		cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"},
+		cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
+		cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
+		cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
+		cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
+		cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
+		cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
+		cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
+		cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"},
+		cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"},
+		cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"},
+	},
+	Action: func(context *cli.Context) error {
+		// Exactly one positional argument (the container id) is required.
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		// XXX: Currently this is untested with rootless containers.
+		if os.Geteuid() != 0 || system.RunningInUserNS() {
+			logrus.Warn("runc checkpoint is untested with rootless containers")
+		}
+
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		status, err := container.Status()
+		if err != nil {
+			return err
+		}
+		// criu needs a live process tree to dump: refuse containers that have
+		// not started yet or have already exited.
+		if status == libcontainer.Created || status == libcontainer.Stopped {
+			// NOTE(review): fatalf exits the process instead of returning an
+			// error like the rest of this Action — confirm this is intended.
+			fatalf("Container cannot be checkpointed in %s state", status.String())
+		}
+		defer destroy(container)
+		options := criuOptions(context)
+		// these are the mandatory criu options for a container
+		setPageServer(context, options)
+		setManageCgroupsMode(context, options)
+		if err := setEmptyNsMask(context, options); err != nil {
+			return err
+		}
+		return container.Checkpoint(options)
+	},
+}
+
+// getCheckpointImagePath returns the directory criu image files should be
+// written to: the --image-path flag if given, otherwise the per-container
+// default location.
+func getCheckpointImagePath(context *cli.Context) string {
+	if p := context.String("image-path"); p != "" {
+		return p
+	}
+	return getDefaultImagePath(context)
+}
+
+// setPageServer fills options.PageServer from the optional --page-server
+// flag, which takes the form ADDRESS:PORT. Splitting happens on the *last*
+// colon so IPv6 addresses (which contain colons) are not torn apart, and the
+// port must be a valid 16-bit unsigned integer. Invalid input is fatal, as
+// with the other criu option helpers in this file.
+func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) {
+	// xxx following criu opts are optional
+	// The dump image can be sent to a criu page server
+	psOpt := context.String("page-server")
+	if psOpt == "" {
+		return
+	}
+	sep := strings.LastIndex(psOpt, ":")
+	if sep < 0 {
+		fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server"))
+	}
+	// Strip optional IPv6 brackets, e.g. "[::1]:9000" -> "::1".
+	address := strings.Trim(psOpt[:sep], "[]")
+	// ParseUint with bitSize 16 rejects ports outside 0-65535, which the
+	// previous Atoi-based parse silently accepted.
+	port, err := strconv.ParseUint(psOpt[sep+1:], 10, 16)
+	if err != nil {
+		fatal(fmt.Errorf("Invalid port number"))
+	}
+	options.PageServer = libcontainer.CriuPageServerInfo{
+		Address: address,
+		Port:    int32(port),
+	}
+}
+
+// setManageCgroupsMode maps the --manage-cgroups-mode flag value onto the
+// corresponding libcontainer criu cgroup-mode constant. An empty flag leaves
+// the option untouched; an unknown value is fatal.
+func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) {
+	cgOpt := context.String("manage-cgroups-mode")
+	if cgOpt == "" {
+		return
+	}
+	switch cgOpt {
+	case "soft":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT
+	case "full":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL
+	case "strict":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT
+	default:
+		fatal(fmt.Errorf("Invalid manage cgroups mode"))
+	}
+}
+
+// namespaceMapping translates OCI runtime-spec namespace names (as accepted
+// by --empty-ns) to the corresponding clone(2) flags. Only the network
+// namespace is currently supported.
+var namespaceMapping = map[specs.LinuxNamespaceType]int{
+	specs.NetworkNamespace: unix.CLONE_NEWNET,
+}
+
+// setEmptyNsMask computes the bitmask of namespaces whose properties criu
+// should not restore (from repeated --empty-ns flags) and stores it in
+// options.EmptyNs. The network namespace is always included because runc
+// does not manage network devices or their configuration. Unsupported
+// namespace names yield an error.
+func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error {
+	// Runc doesn't manage network devices and their configuration.
+	mask := unix.CLONE_NEWNET
+
+	for _, ns := range context.StringSlice("empty-ns") {
+		flag, ok := namespaceMapping[specs.LinuxNamespaceType(ns)]
+		if !ok {
+			return fmt.Errorf("namespace %q is not supported", ns)
+		}
+		mask |= flag
+	}
+
+	options.EmptyNs = uint32(mask)
+	return nil
+}
--- /dev/null
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net"
+ "os"
+ "strings"
+
+ "github.com/containerd/console"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/urfave/cli"
+)
+
+// version will be populated by the Makefile, read from the
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile.
+var gitCommit = ""
+
+// usage is the long help text shown by the cli framework when recvtty is run
+// with --help. It is user-facing output and must not be reworded casually.
+const (
+	usage = `Open Container Initiative contrib/cmd/recvtty
+
+recvtty is a reference implementation of a consumer of runC's --console-socket
+API. It has two main modes of operation:
+
+  * single: Only permit one terminal to be sent to the socket, which is
+	then hooked up to the stdio of the recvtty process. This is useful
+	for rudimentary shell management of a container.
+
+  * null: Permit as many terminals to be sent to the socket, but they
+	are read to /dev/null. This is used for testing, and imitates the
+	old runC API's --console=/dev/pts/ptmx hack which would allow for a
+	similar trick. This is probably not what you want to use, unless
+	you're doing something like our bats integration tests.
+
+To use recvtty, just specify a socket path at which you want to receive
+terminals:
+
+	$ recvtty [--mode <single|null>] socket.sock
+`
+)
+
+// bail reports a fatal error on stderr (prefixed with the program name) and
+// terminates the process with a non-zero exit status.
+func bail(err error) {
+	fmt.Fprintf(os.Stderr, "[recvtty] fatal error: %v\n", err)
+	os.Exit(1)
+}
+
+// handleSingle implements --mode=single: accept exactly one connection on the
+// unix socket at path, receive the console master fd from runC over it, and
+// wire that console up to our own stdio until either direction of the copy
+// finishes.
+func handleSingle(path string) error {
+	// Open a socket.
+	ln, err := net.Listen("unix", path)
+	if err != nil {
+		return err
+	}
+	defer ln.Close()
+
+	// We only accept a single connection, since we can only really have
+	// one reader for os.Stdin. Plus this is all a PoC.
+	conn, err := ln.Accept()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	// Close ln, to allow for other instances to take over.
+	ln.Close()
+
+	// Get the fd of the connection.
+	unixconn, ok := conn.(*net.UnixConn)
+	if !ok {
+		return fmt.Errorf("failed to cast to unixconn")
+	}
+
+	socket, err := unixconn.File()
+	if err != nil {
+		return err
+	}
+	defer socket.Close()
+
+	// Get the master file descriptor from runC.
+	master, err := utils.RecvFd(socket)
+	if err != nil {
+		return err
+	}
+	c, err := console.ConsoleFromFile(master)
+	if err != nil {
+		return err
+	}
+	console.ClearONLCR(c.Fd())
+
+	// Copy from our stdio to the master fd. The channel is buffered with
+	// capacity 2 so that whichever copier finishes second does not block
+	// forever on its send and leak a goroutine (we only ever receive once).
+	quitChan := make(chan struct{}, 2)
+	go func() {
+		io.Copy(os.Stdout, c)
+		quitChan <- struct{}{}
+	}()
+	go func() {
+		io.Copy(c, os.Stdin)
+		quitChan <- struct{}{}
+	}()
+
+	// Only close the master fd once we've stopped copying.
+	<-quitChan
+	c.Close()
+	return nil
+}
+
+// handleNull implements --mode=null: accept any number of connections on the
+// unix socket at path, receive a console master fd from each, and drain the
+// console output to /dev/null. Stdin is never forwarded.
+func handleNull(path string) error {
+	// Open a socket.
+	ln, err := net.Listen("unix", path)
+	if err != nil {
+		return err
+	}
+	defer ln.Close()
+
+	// As opposed to handleSingle we accept as many connections as we get, but
+	// we don't interact with Stdin at all (and we copy stdout to /dev/null).
+	for {
+		conn, err := ln.Accept()
+		if err != nil {
+			return err
+		}
+		go func(conn net.Conn) {
+			// Don't leave references lying around.
+			defer conn.Close()
+
+			// Get the fd of the connection.
+			unixconn, ok := conn.(*net.UnixConn)
+			if !ok {
+				return
+			}
+
+			socket, err := unixconn.File()
+			if err != nil {
+				return
+			}
+			defer socket.Close()
+
+			// Get the master file descriptor from runC. It must be closed
+			// when this handler returns, otherwise every connection leaks
+			// one pty fd for the life of the process.
+			master, err := utils.RecvFd(socket)
+			if err != nil {
+				return
+			}
+			defer master.Close()
+
+			// Just do a dumb copy to /dev/null.
+			devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
+			if err != nil {
+				// TODO: Handle this nicely.
+				return
+			}
+			defer devnull.Close()
+
+			io.Copy(devnull, master)
+		}(conn)
+	}
+}
+
+// main wires up the recvtty CLI: name, usage text, version string (mirroring
+// runC's scheme), flags, and dispatch to the selected mode handler.
+func main() {
+	app := cli.NewApp()
+	app.Name = "recvtty"
+	app.Usage = usage
+
+	// Set version to be the same as runC.
+	var versionParts []string
+	if version != "" {
+		versionParts = append(versionParts, version)
+	}
+	if gitCommit != "" {
+		versionParts = append(versionParts, fmt.Sprintf("commit: %s", gitCommit))
+	}
+	app.Version = strings.Join(versionParts, "\n")
+
+	// Set the flags.
+	app.Flags = []cli.Flag{
+		cli.StringFlag{
+			Name:  "mode, m",
+			Value: "single",
+			Usage: "Mode of operation (single or null)",
+		},
+		cli.StringFlag{
+			Name:  "pid-file",
+			Value: "",
+			Usage: "Path to write daemon process ID to",
+		},
+	}
+
+	app.Action = func(ctx *cli.Context) error {
+		args := ctx.Args()
+		if len(args) != 1 {
+			return fmt.Errorf("need to specify a single socket path")
+		}
+		path := args[0]
+
+		// Optionally record our PID so a supervisor can find us.
+		if pidPath := ctx.String("pid-file"); pidPath != "" {
+			pid := fmt.Sprintf("%d\n", os.Getpid())
+			if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil {
+				return err
+			}
+		}
+
+		mode := ctx.String("mode")
+		switch mode {
+		case "single":
+			return handleSingle(path)
+		case "null":
+			return handleNull(path)
+		default:
+			return fmt.Errorf("need to select a valid mode: %s", mode)
+		}
+	}
+	if err := app.Run(os.Args); err != nil {
+		bail(err)
+	}
+}
--- /dev/null
+#!/bin/bash
+#
+# bash completion file for runc command
+#
+# This script provides completion of:
+# - commands and their options
+# - filepaths
+#
+# To enable the completions either:
+# - place this file in /usr/share/bash-completion/completions
+# or
+# - copy this file to e.g. ~/.runc-completion.sh and add the line
+# below to your .bashrc after bash completion features are loaded
+# . ~/.runc-completion.sh
+#
+# Configuration:
+#
+
+# Note for developers:
+# Please arrange options sorted alphabetically by long name with the short
+# options immediately following their corresponding long form.
+# This order should be applied to lists, alternatives and code blocks.
+
+# Remember the caller's extglob setting so it can be restored at the end of
+# this file; extended globs are required for the option-pattern matching in
+# the case statements below.
+__runc_previous_extglob_setting=$(shopt -p extglob)
+shopt -s extglob
+
+# Complete with the IDs of all known containers (`runc list -q`), filtered by
+# the word currently being completed ($cur).
+__runc_list_all() {
+	COMPREPLY=($(compgen -W "$(runc list -q)" -- $cur))
+}
+
+# Prints the index (into $words) of the first word after the (sub)command
+# that is not a flag. $1 is an optional extglob pattern of options that take
+# an argument; a word matching it causes its argument word to be skipped too.
+__runc_pos_first_nonflag() {
+	local argument_flags=$1
+
+	# Start scanning just past the subcommand (or command) position.
+	local counter=$((${subcommand_pos:-${command_pos}} + 1))
+	while [ $counter -le $cword ]; do
+		if [ -n "$argument_flags" ] && eval "case '${words[$counter]}' in $argument_flags) true ;; *) false ;; esac"; then
+			# The current word is an option that consumes the next word as
+			# its argument: advance an extra step to skip that argument.
+			((counter++))
+		else
+			case "${words[$counter]}" in
+				-*) ;;
+				*)
+					break
+					;;
+			esac
+		fi
+		((counter++))
+	done
+
+	echo $counter
+}
+
+# Joins the whitespace-separated words of $1 into a single "|"-separated
+# string, suitable for building extglob alternations for
+# __runc_pos_first_nonflag().
+__runc_to_alternatives() {
+	# Word-split $1 on default IFS first, *then* switch IFS so the array
+	# expansion below joins with "|".
+	local opts=($1)
+	local IFS='|'
+	echo "${opts[*]}"
+}
+
+# Wraps the "|"-joined form of the option list $1 in "@(...)", yielding an
+# extglob pattern usable directly in case statements.
+__runc_to_extglob() {
+	echo "@($(__runc_to_alternatives "$1"))"
+}
+
+# Subcommand processing.
+# Locates the first occurrence of any of the subcommands contained in the
+# first argument. In case of a match, calls the corresponding completion
+# function and returns 0.
+# If no match is found, 1 is returned. The calling function can then
+# continue processing its completion.
+#
+# TODO if the preceding command has options that accept arguments and an
+# argument is equal to one of the subcommands, this is falsely detected as
+# a match.
+__runc_subcommands() {
+	local subcommands="$1"
+
+	# Scan the words after the command for the first matching subcommand.
+	local counter=$(($command_pos + 1))
+	while [ $counter -lt $cword ]; do
+		case "${words[$counter]}" in
+			$(__runc_to_extglob "$subcommands"))
+				subcommand_pos=$counter
+				local subcommand=${words[$counter]}
+				# Dispatch to _runc_<command>_<subcommand> if it exists.
+				local completions_func=_runc_${command}_${subcommand}
+				declare -F $completions_func >/dev/null && $completions_func
+				return 0
+				;;
+		esac
+		((counter++))
+	done
+	return 1
+}
+
+# Complete with all signal names (SIGHUP, SIGKILL, ...): `kill -l` output is
+# flattened into single tokens and the numeric indices are dropped by the
+# grep for "SIG". The result is filtered by $cur so an already-typed prefix
+# (e.g. "SIGK") narrows the candidates — the previous version omitted
+# `-- "$cur"` and always offered every signal.
+__runc_list_signals() {
+	COMPREPLY=($(compgen -W "$(for i in $(kill -l | xargs); do echo $i; done | grep SIG)" -- "$cur"))
+}
+
+# Suppress the trailing space bash normally appends after a completion, so
+# path completions can be extended in place.
+__runc_nospace() {
+	# compopt is not available in ancient bash versions
+	type compopt &>/dev/null && compopt -o nospace
+}
+
+# Complete with Linux capability names (without the CAP_ prefix), filtered by
+# $cur. The list of capabilities is defined in types.go; ALL was added
+# manually.
+__runc_complete_capabilities() {
+	COMPREPLY=($(compgen -W "
+		ALL
+		AUDIT_CONTROL
+		AUDIT_WRITE
+		AUDIT_READ
+		BLOCK_SUSPEND
+		CHOWN
+		DAC_OVERRIDE
+		DAC_READ_SEARCH
+		FOWNER
+		FSETID
+		IPC_LOCK
+		IPC_OWNER
+		KILL
+		LEASE
+		LINUX_IMMUTABLE
+		MAC_ADMIN
+		MAC_OVERRIDE
+		MKNOD
+		NET_ADMIN
+		NET_BIND_SERVICE
+		NET_BROADCAST
+		NET_RAW
+		SETFCAP
+		SETGID
+		SETPCAP
+		SETUID
+		SYS_ADMIN
+		SYS_BOOT
+		SYS_CHROOT
+		SYSLOG
+		SYS_MODULE
+		SYS_NICE
+		SYS_PACCT
+		SYS_PTRACE
+		SYS_RAWIO
+		SYS_RESOURCE
+		SYS_TIME
+		SYS_TTY_CONFIG
+		WAKE_ALARM
+	" -- "$cur"))
+}
+
+# Completion for "runc exec": capability names for --cap, filesystem paths
+# for path-valued options, environment variable names for --env, otherwise
+# flags or container IDs.
+_runc_exec() {
+	# NOTE(review): the ", -t"-style entries below mean the comma-joined form
+	# is offered verbatim as a completion word — confirm this is intended.
+	local boolean_options="
+		--help
+		--no-new-privs
+		--tty, -t
+		--detach, -d
+	"
+
+	local options_with_args="
+		--console
+		--cwd
+		--env, -e
+		--user, -u
+		--process, -p
+		--pid-file
+		--process-label
+		--apparmor
+		--cap, -c
+	"
+
+	local all_options="$options_with_args $boolean_options"
+
+	case "$prev" in
+		--cap | -c)
+			__runc_complete_capabilities
+			return
+			;;
+
+		--console | --cwd | --process | --apparmor)
+			case "$cur" in
+				*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+				'')
+					# Seed with "/" so the user can start typing an absolute path.
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				/*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+		--env | -e)
+			# compgen -e completes exported environment variable names.
+			COMPREPLY=($(compgen -e -- "$cur"))
+			__runc_nospace
+			return
+			;;
+		$(__runc_to_extglob "$options_with_args"))
+			# The previous word expects a free-form argument: offer nothing.
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+
+# Completion for the bare "runc" command itself: global options that may
+# appear before any subcommand, then the subcommand names.
+_runc_runc() {
+	local boolean_options="
+		$global_boolean_options
+		--help
+		--version -v
+		--debug
+	"
+	local options_with_args="
+		--log
+		--log-format
+		--root
+		--criu
+	"
+
+	case "$prev" in
+		--log | --root | --criu)
+			case "$cur" in
+				*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+				'')
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		--log-format)
+			COMPREPLY=($(compgen -W 'text json' -- "$cur"))
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			# Only offer subcommand names at the first non-flag position.
+			local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+			if [ $cword -eq $counter ]; then
+				COMPREPLY=($(compgen -W "${commands[*]} help" -- "$cur"))
+			fi
+			;;
+	esac
+}
+
+_runc_pause() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+
+_runc_ps() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+
+_runc_delete() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+
+# Completion for "runc kill": container IDs directly after the subcommand,
+# signal names afterwards.
+_runc_kill() {
+	local boolean_options="
+		--help
+		-h
+		--all
+		-a
+	"
+
+	case "$prev" in
+		"kill")
+			__runc_list_all
+			return
+			;;
+		*)
+			__runc_list_signals
+			return
+			;;
+	esac
+	# NOTE(review): both branches above return unconditionally, so the case
+	# below is unreachable — the boolean flags (--all/-a) are never offered.
+	# Left as-is to preserve behavior; confirm whether this is intended.
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+
+# Completion for "runc events": --interval takes an argument, the other flags
+# are boolean, and the positional argument is a container ID.
+_runc_events() {
+	local boolean_options="
+		--help
+		--stats
+	"
+
+	local options_with_args="
+		--interval
+	"
+
+	# When the previous word is an option expecting a value, offer nothing.
+	case "$prev" in
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	if [[ "$cur" == -* ]]; then
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+	else
+		__runc_list_all
+	fi
+}
+
+# Completion for "runc list": --format takes 'text' or 'json'; otherwise
+# flags only (list takes no container-ID argument).
+_runc_list() {
+	local boolean_options="
+		--help
+		--quiet
+		-q
+	"
+
+	local options_with_args="
+		--format
+		-f
+	"
+
+	case "$prev" in
+		--format | -f)
+			COMPREPLY=($(compgen -W 'text json' -- "$cur"))
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			# NOTE(review): $counter is computed but never used here —
+			# appears vestigial; kept as-is to preserve behavior.
+			local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+			;;
+	esac
+}
+
+# Completion for "runc spec": --bundle/-b takes a directory path; otherwise
+# flags only.
+_runc_spec() {
+	local boolean_options="
+		--help
+	"
+
+	local options_with_args="
+		--bundle
+		-b
+	"
+
+	case "$prev" in
+		--bundle | -b)
+			case "$cur" in
+				'')
+					# Seed with "/" so the user can start typing a path.
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				/*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			# NOTE(review): $counter is computed but never used here —
+			# appears vestigial; kept as-is to preserve behavior.
+			local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+			;;
+	esac
+}
+
+# Completion for "runc run": path completion for --bundle/--console/--pid-file,
+# otherwise flags or container IDs. Fixes the misspelled "--detatch" long
+# option, which previously made the correct "--detach" uncompletable.
+_runc_run() {
+	local boolean_options="
+		--help
+		--detach
+		-d
+		--no-subreaper
+		--no-pivot
+		--no-new-keyring
+	"
+
+	local options_with_args="
+		--bundle
+		-b
+		--console
+		--pid-file
+	"
+
+	case "$prev" in
+		--bundle | -b | --console | --pid-file)
+			case "$cur" in
+				'')
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				/*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+
+# Completion for "runc checkpoint": cgroup modes for --manage-cgroups-mode,
+# paths for --image-path/--work-path, otherwise flags or container IDs.
+_runc_checkpoint() {
+	local boolean_options="
+		--help
+		-h
+		--leave-running
+		--tcp-established
+		--ext-unix-sk
+		--shell-job
+		--file-locks
+	"
+
+	local options_with_args="
+		--image-path
+		--work-path
+		--page-server
+		--manage-cgroups-mode
+	"
+
+	case "$prev" in
+		--page-server) ;;
+
+		--manage-cgroups-mode)
+			COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+			return
+			;;
+
+		--image-path | --work-path)
+			case "$cur" in
+				*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+				'')
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+# Completion for "runc create": path completion for --bundle/--console/
+# --pid-file, otherwise flags or container IDs.
+_runc_create() {
+	local boolean_options="
+		--help
+		--no-pivot
+		--no-new-keyring
+	"
+
+	local options_with_args="
+		--bundle
+		-b
+		--console
+		--pid-file
+	"
+	case "$prev" in
+		--bundle | -b | --console | --pid-file)
+			case "$cur" in
+				'')
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				/*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+
+}
+
+# Completion for "runc help": offer subcommand names at the first argument
+# position only.
+_runc_help() {
+	local counter=$(__runc_pos_first_nonflag)
+	if [ $cword -eq $counter ]; then
+		COMPREPLY=($(compgen -W "${commands[*]}" -- "$cur"))
+	fi
+}
+
+# Completion for "runc restore": cgroup modes for --manage-cgroups-mode,
+# paths for the path-valued options, otherwise flags or container IDs.
+_runc_restore() {
+	local boolean_options="
+		--help
+		--tcp-established
+		--ext-unix-sk
+		--shell-job
+		--file-locks
+		--detach
+		-d
+		--no-subreaper
+		--no-pivot
+	"
+
+	local options_with_args="
+		-b
+		--bundle
+		--image-path
+		--work-path
+		--manage-cgroups-mode
+		--pid-file
+	"
+
+	local all_options="$options_with_args $boolean_options"
+
+	case "$prev" in
+		--manage-cgroups-mode)
+			COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+			return
+			;;
+
+		--pid-file | --image-path | --work-path | --bundle | -b)
+			case "$cur" in
+				*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+				'')
+					COMPREPLY=($(compgen -W '/' -- "$cur"))
+					__runc_nospace
+					;;
+				/*)
+					_filedir
+					__runc_nospace
+					;;
+			esac
+			return
+			;;
+
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+
+_runc_resume() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+
+_runc_state() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+_runc_start() {
+ local boolean_options="
+ --help
+ -h
+ "
+
+ case "$cur" in
+ -*)
+ COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+ ;;
+ *)
+ __runc_list_all
+ ;;
+ esac
+}
+# Completion for "runc update": every resource option takes a free-form
+# value, so only the flag names themselves and container IDs are completed.
+_runc_update() {
+	local boolean_options="
+		--help
+	"
+
+	local options_with_args="
+		--blkio-weight
+		--cpu-period
+		--cpu-quota
+		--cpu-rt-period
+		--cpu-rt-runtime
+		--cpu-share
+		--cpuset-cpus
+		--cpuset-mems
+		--kernel-memory
+		--kernel-memory-tcp
+		--memory
+		--memory-reservation
+		--memory-swap
+
+	"
+
+	case "$prev" in
+		$(__runc_to_extglob "$options_with_args"))
+			return
+			;;
+	esac
+
+	case "$cur" in
+		-*)
+			COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+			;;
+		*)
+			__runc_list_all
+			;;
+	esac
+}
+
+# Top-level completion entry point registered with `complete`. Determines
+# which runc subcommand (if any) has been typed and dispatches to the
+# matching _runc_<command> function.
+_runc() {
+	local previous_extglob_setting=$(shopt -p extglob)
+	shopt -s extglob
+
+	local commands=(
+		checkpoint
+		create
+		delete
+		events
+		exec
+		init
+		kill
+		list
+		pause
+		ps
+		restore
+		resume
+		run
+		spec
+		start
+		state
+		update
+		help
+		h
+	)
+
+	# These options are valid as global options for all client commands
+	# and valid as command options for `runc daemon`
+	local global_boolean_options="
+		--help -h
+		--version -v
+	"
+
+	COMPREPLY=()
+	local cur prev words cword
+	_get_comp_words_by_ref -n : cur prev words cword
+
+	# Find the first non-flag word: that is the subcommand. Until one is
+	# found, completion is handled by _runc_runc.
+	local command='runc' command_pos=0 subcommand_pos
+	local counter=1
+	while [ $counter -lt $cword ]; do
+		case "${words[$counter]}" in
+			-*) ;;
+			=)
+				# Skip the value of an option given as --opt = value.
+				((counter++))
+				;;
+			*)
+				command="${words[$counter]}"
+				command_pos=$counter
+				break
+				;;
+		esac
+		((counter++))
+	done
+
+	local completions_func=_runc_${command}
+	declare -F $completions_func >/dev/null && $completions_func
+
+	eval "$previous_extglob_setting"
+	return 0
+}
+
+eval "$__runc_previous_extglob_setting"
+unset __runc_previous_extglob_setting
+
+complete -F _runc runc
--- /dev/null
+package main
+
+import (
+ "os"
+
+ "github.com/urfave/cli"
+)
+
// createCommand implements "runc create": it sets up a container from an OCI
// bundle without executing the user-specified process (execution begins later
// with "runc start").
var createCommand = cli.Command{
	Name:  "create",
	Usage: "create a container",
	ArgsUsage: `<container-id>

Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
	Description: `The create command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.

The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "bundle, b",
			Value: "",
			Usage: `path to the root of the bundle directory, defaults to the current directory`,
		},
		cli.StringFlag{
			Name:  "console-socket",
			Value: "",
			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
		},
		cli.StringFlag{
			Name:  "pid-file",
			Value: "",
			Usage: "specify the file to write the process id to",
		},
		cli.BoolFlag{
			Name:  "no-pivot",
			Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk",
		},
		cli.BoolFlag{
			Name:  "no-new-keyring",
			Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key",
		},
		cli.IntFlag{
			Name:  "preserve-fds",
			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
		},
	},
	Action: func(context *cli.Context) error {
		// Exactly one positional argument (the container id) is required.
		if err := checkArgs(context, 1, exactArgs); err != nil {
			return err
		}
		// Adjust the --pid-file path (presumably to an absolute path) before
		// the bundle directory is entered — see revisePidFile.
		if err := revisePidFile(context); err != nil {
			return err
		}
		// Load the OCI runtime spec from the bundle named by the flags.
		spec, err := setupSpec(context)
		if err != nil {
			return err
		}
		status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
		if err != nil {
			return err
		}
		// exit with the container's exit status so any external supervisor is
		// notified of the exit with the correct exit status.
		os.Exit(status)
		return nil
	},
}
--- /dev/null
+// +build !solaris
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "syscall"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/urfave/cli"
+
+ "golang.org/x/sys/unix"
+)
+
+func killContainer(container libcontainer.Container) error {
+ _ = container.Signal(unix.SIGKILL, false)
+ for i := 0; i < 100; i++ {
+ time.Sleep(100 * time.Millisecond)
+ if err := container.Signal(syscall.Signal(0), false); err != nil {
+ destroy(container)
+ return nil
+ }
+ }
+ return fmt.Errorf("container init still running")
+}
+
+var deleteCommand = cli.Command{
+ Name: "delete",
+ Usage: "delete any resources held by the container often used with detached container",
+ ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" and runc list currently shows the
+status of "ubuntu01" as "stopped" the following will delete resources held for
+"ubuntu01" removing "ubuntu01" from the runc list of containers:
+
+ # runc delete ubuntu01`,
+ Flags: []cli.Flag{
+ cli.BoolFlag{
+ Name: "force, f",
+ Usage: "Forcibly deletes the container if it is still running (uses SIGKILL)",
+ },
+ },
+ Action: func(context *cli.Context) error {
+ if err := checkArgs(context, 1, exactArgs); err != nil {
+ return err
+ }
+
+ id := context.Args().First()
+ force := context.Bool("force")
+ container, err := getContainer(context)
+ if err != nil {
+ if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists {
+ // if there was an aborted start or something of the sort then the container's directory could exist but
+ // libcontainer does not see it because the state.json file inside that directory was never created.
+ path := filepath.Join(context.GlobalString("root"), id)
+ if e := os.RemoveAll(path); e != nil {
+ fmt.Fprintf(os.Stderr, "remove %s: %v\n", path, e)
+ }
+ if force {
+ return nil
+ }
+ }
+ return err
+ }
+ s, err := container.Status()
+ if err != nil {
+ return err
+ }
+ switch s {
+ case libcontainer.Stopped:
+ destroy(container)
+ case libcontainer.Created:
+ return killContainer(container)
+ default:
+ if force {
+ return killContainer(container)
+ }
+ return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s)
+ }
+
+ return nil
+ },
+}
--- /dev/null
+# Terminals and Standard IO #
+
+*Note that the default configuration of `runc` (foreground, new terminal) is
+generally the best option for most users. This document exists to help explain
+what the purpose of the different modes is, and to try to steer users away from
+common mistakes and misunderstandings.*
+
+In general, most processes on Unix (and Unix-like) operating systems have 3
+standard file descriptors provided at the start, collectively referred to as
+"standard IO" (`stdio`):
+
+* `0`: standard-in (`stdin`), the input stream into the process
+* `1`: standard-out (`stdout`), the output stream from the process
+* `2`: standard-error (`stderr`), the error stream from the process
+
+When creating and running a container via `runc`, it is important to take care
+to structure the `stdio` the new container's process receives. In some ways
+containers are just regular processes, while in other ways they're an isolated
+sub-partition of your machine (in a similar sense to a VM). This means that the
+structure of IO is not as simple as with ordinary programs (which generally
+just use the file descriptors you give them).
+
+## Other File Descriptors ##
+
+Before we continue, it is important to note that processes can have more file
+descriptors than just `stdio`. By default in `runc` no other file descriptors
+will be passed to the spawned container process. If you wish to explicitly pass
+file descriptors to the container you have to use the `--preserve-fds` option.
+These ancillary file descriptors don't have any of the strange semantics
+discussed further in this document (those only apply to `stdio`) -- they are
+passed untouched by `runc`.
+
+It should be noted that `--preserve-fds` does not take individual file
+descriptors to preserve. Instead, it takes how many file descriptors (not
+including `stdio` or `LISTEN_FDS`) should be passed to the container. In the
+following example:
+
+```
+% runc run --preserve-fds 5 <container>
+```
+
+`runc` will pass the first `5` file descriptors (`3`, `4`, `5`, `6`, and `7` --
+assuming that `LISTEN_FDS` has not been configured) to the container.
+
+In addition to `--preserve-fds`, `LISTEN_FDS` file descriptors are passed
+automatically to allow for `systemd`-style socket activation. To extend the
+above example:
+
+```
+% LISTEN_PID=$pid_of_runc LISTEN_FDS=3 runc run --preserve-fds 5 <container>
+```
+
+`runc` will now pass the first `8` file descriptors (and it will also pass
+`LISTEN_FDS=3` and `LISTEN_PID=1` to the container). The first `3` (`3`, `4`,
+and `5`) were passed due to `LISTEN_FDS` and the other `5` (`6`, `7`, `8`, `9`,
+and `10`) were passed due to `--preserve-fds`. You should keep this in mind if
+you use `runc` directly in something like a `systemd` unit file. To disable
+this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`.
+
+**Be very careful when passing file descriptors to a container process.** Due
+to some Linux kernel (mis)features, a container with access to certain types of
+file descriptors (such as `O_PATH` descriptors) outside of the container's root
+file system can use these to break out of the container's pivoted mount
+namespace. [This has resulted in CVEs in the past.][CVE-2016-9962]
+
+[CVE-2016-9962]: https://nvd.nist.gov/vuln/detail/CVE-2016-9962
+
+## <a name="terminal-modes" /> Terminal Modes ##
+
+`runc` supports two distinct methods for passing `stdio` to the container's
+primary process:
+
+* [new terminal](#new-terminal) (`terminal: true`)
+* [pass-through](#pass-through) (`terminal: false`)
+
+When first using `runc` these two modes will look incredibly similar, but this
+can be quite deceptive as these different modes have quite different
+characteristics.
+
+By default, `runc spec` will create a configuration that will create a new
+terminal (`terminal: true`). However, if the `terminal: ...` line is not
+present in `config.json` then pass-through is the default.
+
+*In general we recommend using new terminal, because it means that tools like
+`sudo` will work inside your container. But pass-through can be useful if you
+know what you're doing, or if you're using `runc` as part of a non-interactive
+pipeline.*
+
+### <a name="new-terminal"> New Terminal ###
+
+In new terminal mode, `runc` will create a brand-new "console" (or more
+precisely, a new pseudo-terminal using the container's namespaced
+`/dev/pts/ptmx`) for your contained process to use as its `stdio`.
+
+When you start a process in new terminal mode, `runc` will do the following:
+
+1. Create a new pseudo-terminal.
+2. Pass the slave end to the container's primary process as its `stdio`.
+3. Send the master end to a process to interact with the `stdio` for the
+ container's primary process ([details below](#runc-modes)).
+
+It should be noted that since a new pseudo-terminal is being used for
+communication with the container, some strange properties of pseudo-terminals
+might surprise you. For instance, by default, all new pseudo-terminals
+translate the byte `'\n'` to the sequence `'\r\n'` on both `stdout` and
+`stderr`. In addition there are [a whole range of `ioctls(2)` that can only
+interact with pseudo-terminal `stdio`][tty_ioctl(4)].
+
+> **NOTE**: In new terminal mode, all three `stdio` file descriptors are the
+> same underlying file. The reason for this is to match how a shell's `stdio`
+> looks to a process (as well as remove race condition issues with having to
+> deal with multiple master pseudo-terminal file descriptors). However this
+> means that it is not really possible to uniquely distinguish between `stdout`
+> and `stderr` from the caller's perspective.
+
+[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl
+
+### <a name="pass-through"> Pass-Through ###
+
+If you have already set up some file handles that you wish your contained
+process to use as its `stdio`, then you can ask `runc` to pass them through to
+the contained process (this is not necessarily the same as `--preserve-fds`'s
+passing of file descriptors -- [details below](#runc-modes)). As an example
+(assuming that `terminal: false` is set in `config.json`):
+
+```
+% echo input | runc run some_container >/tmp/log.out 2>/tmp/log.err
+```
+
+Here the container's various `stdio` file descriptors will be substituted with
+the following:
+
+* `stdin` will be sourced from the `echo input` pipeline.
+* `stdout` will be output into `/tmp/log.out` on the host.
+* `stderr` will be output into `/tmp/log.err` on the host.
+
+It should be noted that the actual file handles seen inside the container may
+be different [based on the mode `runc` is being used in](#runc-modes) (for
+instance, the file referenced by `1` could be `/tmp/log.out` directly or a pipe
+which `runc` is using to buffer output, based on the mode). However the net
+result will be the same in either case. In principle you could use the [new
+terminal mode](#new-terminal) in a pipeline, but the difference will become
+more clear when you are introduced to [`runc`'s detached mode](#runc-modes).
+
+## <a name="runc-modes" /> `runc` Modes ##
+
+`runc` itself runs in two modes:
+
+* [foreground](#foreground)
+* [detached](#detached)
+
+You can use either [terminal mode](#terminal-modes) with either `runc` mode.
+However, there are considerations that may indicate preference for one mode
+over another. It should be noted that while two types of modes (terminal and
+`runc`) are conceptually independent from each other, you should be aware of
+the intricacies of which combination you are using.
+
+*In general we recommend using foreground because it's the most
+straight-forward to use, with the only downside being that you will have a
+long-running `runc` process. Detached mode is difficult to get right and
+generally requires having your own `stdio` management.*
+
+### Foreground ###
+
+The default (and most straight-forward) mode of `runc`. In this mode, your
+`runc` command remains in the foreground with the container process as a child.
+All `stdio` is buffered through the foreground `runc` process (irrespective of
+which terminal mode you are using). This is conceptually quite similar to
+running a normal process interactively in a shell (and if you are using `runc`
+in a shell interactively, this is what you should use).
+
+Because the `stdio` will be buffered in this mode, some very important
+peculiarities of this mode should be kept in mind:
+
+* With [new terminal mode](#new-terminal), the container will see a
+ pseudo-terminal as its `stdio` (as you might expect). However, the `stdio` of
+ the foreground `runc` process will remain the `stdio` that the process was
+ started with -- and `runc` will copy all `stdio` between its `stdio` and the
+ container's `stdio`. This means that while a new pseudo-terminal has been
+ created, the foreground `runc` process manages it over the lifetime of the
+ container.
+
+* With [pass-through mode](#pass-through), the foreground `runc`'s `stdio` is
+ **not** passed to the container. Instead, the container's `stdio` is a set of
+ pipes which are used to copy data between `runc`'s `stdio` and the
+ container's `stdio`. This means that the container never has direct access to
+ host file descriptors (aside from the pipes created by the container runtime,
+ but that shouldn't be an issue).
+
+The main drawback of the foreground mode of operation is that it requires a
+long-running foreground `runc` process. If you kill the foreground `runc`
+process then you will no longer have access to the `stdio` of the container
+(and in most cases this will result in the container dying abnormally due to
+`SIGPIPE` or some other error). By extension this means that any bug in the
+long-running foreground `runc` process (such as a memory leak) or a stray
+OOM-kill sweep could result in your container being killed **through no fault
+of the user**. In addition, there is no way in foreground mode of passing a
+file descriptor directly to the container process as its `stdio` (like
+`--preserve-fds` does).
+
+These shortcomings are obviously sub-optimal and are the reason that `runc` has
+an additional mode called "detached mode".
+
+### Detached ###
+
+In contrast to foreground mode, in detached mode there is no long-running
+foreground `runc` process once the container has started. In fact, there is no
+long-running `runc` process at all. However, this means that it is up to the
+caller to handle the `stdio` after `runc` has set it up for you. In a shell
+this means that the `runc` command will exit and control will return to the
+shell, after the container has been set up.
+
+You can run `runc` in detached mode in one of the following ways:
+
+* `runc run -d ...` which operates similarly to `runc run` but is detached.
+* `runc create` followed by `runc start` which is the standard container
+ lifecycle defined by the OCI runtime specification (`runc create` sets up the
+ container completely, waiting for `runc start` to begin execution of user
+ code).
+
+The main use-case of detached mode is for higher-level tools that want to be
+wrappers around `runc`. By running `runc` in detached mode, those tools have
+far more control over the container's `stdio` without `runc` getting in the
+way (most wrappers around `runc` like `cri-o` or `containerd` use detached mode
+for this reason).
+
+Unfortunately using detached mode is a bit more complicated and requires more
+care than the foreground mode -- mainly because it is now up to the caller to
+handle the `stdio` of the container.
+
+#### Detached Pass-Through ####
+
+In detached mode, pass-through actually does what it says on the tin -- the
+`stdio` file descriptors of the `runc` process are passed through (untouched)
+to the container's `stdio`. The purpose of this option is to allow a user to
+set up `stdio` for a container themselves and then force `runc` to just use
+their pre-prepared `stdio` (without any pseudo-terminal funny business). *If
+you don't see why this would be useful, don't use this option.*
+
+**You must be incredibly careful when using detached pass-through (especially
+in a shell).** The reason for this is that by using detached pass-through you
+are passing host file descriptors to the container. In the case of a shell,
+usually your `stdio` is going to be a pseudo-terminal (on your host). A
+malicious container could take advantage of TTY-specific `ioctls` like
+`TIOCSTI` to fake input into the **host** shell (remember that in detached
+mode, control is returned to your shell and so the terminal you've given the
+container is being read by a shell prompt).
+
+There are also several other issues with running non-malicious containers in a
+shell with detached pass-through (where you pass your shell's `stdio` to the
+container):
+
+* Output from the container will be interleaved with output from your shell (in
+ a non-deterministic way), without any real way of distinguishing from where a
+ particular piece of output came from.
+
+* Any input to `stdin` will be non-deterministically split and given to either
+ the container or the shell (because both are blocked on a `read(2)` of the
+ same FIFO-style file descriptor).
+
+They are all related to the fact that there is going to be a race when either
+your host or the container tries to read from (or write to) `stdio`. This
+problem is especially obvious when in a shell, where usually the terminal has
+been put into raw mode (where each individual key-press should cause `read(2)`
+to return).
+
+> **NOTE**: There is also currently a [known problem][issue-1721] where using
+> detached pass-through will result in the container hanging if the `stdout` or
+> `stderr` is a pipe (though this should be a temporary issue).
+
+[issue-1721]: https://github.com/opencontainers/runc/issues/1721
+
+#### Detached New Terminal ####
+
+When creating a new pseudo-terminal in detached mode, a fairly obvious
+problem appears -- how do we use the new terminal that `runc` created? Unlike
+in pass-through, `runc` has created a new set of file descriptors that need to
+be used by *something* in order for container communication to work.
+
+The way this problem is resolved is through the use of Unix domain sockets.
+There is a feature of Unix sockets called `SCM_RIGHTS` which allows a file
+descriptor to be sent through a Unix socket to a completely separate process
+(which can then use that file descriptor as though they opened it). When using
+`runc` in detached new terminal mode, this is how a user gets access to the
+pseudo-terminal's master file descriptor.
+
+To this end, there is a new option (which is required if you want to use `runc`
+in detached new terminal mode): `--console-socket`. This option takes the path
+to a Unix domain socket which `runc` will connect to and send the
+pseudo-terminal master file descriptor down. The general process for getting
+the pseudo-terminal master is as follows:
+
+1. Create a Unix domain socket at some path, `$socket_path`.
+2. Call `runc run` or `runc create` with the argument `--console-socket
+ $socket_path`.
+3. Using `recvmsg(2)` retrieve the file descriptor sent using `SCM_RIGHTS` by
+ `runc`.
+4. Now the manager can interact with the `stdio` of the container, using the
+ retrieved pseudo-terminal master.
+
+After `runc` exits, the only process with a copy of the pseudo-terminal master
+file descriptor is whoever read the file descriptor from the socket.
+
+> **NOTE**: Currently `runc` doesn't support abstract socket addresses (due to
+> it not being possible to pass an `argv` with a null-byte as the first
+> character). In the future this may change, but currently you must use a valid
+> path name.
+
+In order to help users make use of detached new terminal mode, we have provided
+a [Go implementation in the `go-runc` bindings][containerd/go-runc.Socket], as
+well as [a simple client][recvtty].
+
+[containerd/go-runc.Socket]: https://godoc.org/github.com/containerd/go-runc#Socket
+[recvtty]: /contrib/cmd/recvtty
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "sync"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
// event struct for encoding the event data to json.
type event struct {
	Type string      `json:"type"`            // "stats" or "oom"
	ID   string      `json:"id"`              // container id the event belongs to
	Data interface{} `json:"data,omitempty"`  // payload (a *stats for "stats" events)
}
+
// stats is the runc specific stats structure for stability when encoding and decoding stats.
// It is the "data" payload of a "stats" event; the field layout is part of
// runc's public wire format and must remain backward-compatible.
type stats struct {
	CPU      cpu                `json:"cpu"`
	Memory   memory             `json:"memory"`
	Pids     pids               `json:"pids"`
	Blkio    blkio              `json:"blkio"`
	Hugetlb  map[string]hugetlb `json:"hugetlb"`   // keyed by page size (e.g. "2MB")
	IntelRdt intelRdt           `json:"intel_rdt"`
}
+
// hugetlb mirrors the hugetlb cgroup counters in the stable event format.
type hugetlb struct {
	Usage   uint64 `json:"usage,omitempty"`
	Max     uint64 `json:"max,omitempty"`
	Failcnt uint64 `json:"failcnt"`
}

// blkioEntry is a single per-device (major:minor) block-I/O statistic.
type blkioEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}

// blkio aggregates the recursive blkio cgroup counters.
// NOTE(review): the "ioQueueRecursive" tag does not match the field name
// spelling (IoQueuedRecursive); it is part of the stable wire format and
// must NOT be "corrected" — doing so would break consumers.
type blkio struct {
	IoServiceBytesRecursive []blkioEntry `json:"ioServiceBytesRecursive,omitempty"`
	IoServicedRecursive     []blkioEntry `json:"ioServicedRecursive,omitempty"`
	IoQueuedRecursive       []blkioEntry `json:"ioQueueRecursive,omitempty"`
	IoServiceTimeRecursive  []blkioEntry `json:"ioServiceTimeRecursive,omitempty"`
	IoWaitTimeRecursive     []blkioEntry `json:"ioWaitTimeRecursive,omitempty"`
	IoMergedRecursive       []blkioEntry `json:"ioMergedRecursive,omitempty"`
	IoTimeRecursive         []blkioEntry `json:"ioTimeRecursive,omitempty"`
	SectorsRecursive        []blkioEntry `json:"sectorsRecursive,omitempty"`
}
+
// pids reports the pids cgroup state (current count and limit).
type pids struct {
	Current uint64 `json:"current,omitempty"`
	Limit   uint64 `json:"limit,omitempty"`
}

// throttling reports CFS bandwidth throttling counters.
type throttling struct {
	Periods          uint64 `json:"periods,omitempty"`
	ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"`
	ThrottledTime    uint64 `json:"throttledTime,omitempty"`
}

// cpuUsage reports accumulated CPU time.
type cpuUsage struct {
	// Units: nanoseconds.
	Total  uint64   `json:"total,omitempty"`
	Percpu []uint64 `json:"percpu,omitempty"` // per-CPU breakdown of Total
	Kernel uint64   `json:"kernel"`
	User   uint64   `json:"user"`
}

// cpu combines usage and throttling statistics.
type cpu struct {
	Usage      cpuUsage   `json:"usage,omitempty"`
	Throttling throttling `json:"throttling,omitempty"`
}
+
// memoryEntry is one memory accounting bucket (usage/limit/high-water/failcnt).
type memoryEntry struct {
	Limit   uint64 `json:"limit"`
	Usage   uint64 `json:"usage,omitempty"`
	Max     uint64 `json:"max,omitempty"`
	Failcnt uint64 `json:"failcnt"`
}

// memory groups the memory cgroup buckets plus the raw memory.stat map.
type memory struct {
	Cache     uint64            `json:"cache,omitempty"`
	Usage     memoryEntry       `json:"usage,omitempty"`
	Swap      memoryEntry       `json:"swap,omitempty"`
	Kernel    memoryEntry       `json:"kernel,omitempty"`
	KernelTCP memoryEntry       `json:"kernelTCP,omitempty"`
	Raw       map[string]uint64 `json:"raw,omitempty"` // raw memory.stat key/value pairs
}
+
// l3CacheInfo describes the platform's L3 cache allocation capabilities
// (Intel RDT/CAT).
type l3CacheInfo struct {
	CbmMask    string `json:"cbm_mask,omitempty"`
	MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
	NumClosids uint64 `json:"num_closids,omitempty"`
}

// memBwInfo describes the platform's memory bandwidth allocation capabilities
// (Intel RDT/MBA).
type memBwInfo struct {
	BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
	DelayLinear   uint64 `json:"delay_linear,omitempty"`
	MinBandwidth  uint64 `json:"min_bandwidth,omitempty"`
	NumClosids    uint64 `json:"num_closids,omitempty"`
}

// intelRdt reports Intel RDT (CAT/MBA) state for the container.
type intelRdt struct {
	// The read-only L3 cache information
	L3CacheInfo *l3CacheInfo `json:"l3_cache_info,omitempty"`

	// The read-only L3 cache schema in root
	L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`

	// The L3 cache schema in 'container_id' group
	L3CacheSchema string `json:"l3_cache_schema,omitempty"`

	// The read-only memory bandwidth information
	MemBwInfo *memBwInfo `json:"mem_bw_info,omitempty"`

	// The read-only memory bandwidth schema in root
	MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`

	// The memory bandwidth schema in 'container_id' group
	MemBwSchema string `json:"mem_bw_schema,omitempty"`
}
+
+var eventsCommand = cli.Command{
+ Name: "events",
+ Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics",
+ ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.`,
+ Description: `The events command displays information about the container. By default the
+information is displayed once every 5 seconds.`,
+ Flags: []cli.Flag{
+ cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"},
+ cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"},
+ },
+ Action: func(context *cli.Context) error {
+ if err := checkArgs(context, 1, exactArgs); err != nil {
+ return err
+ }
+ container, err := getContainer(context)
+ if err != nil {
+ return err
+ }
+ duration := context.Duration("interval")
+ if duration <= 0 {
+ return fmt.Errorf("duration interval must be greater than 0")
+ }
+ status, err := container.Status()
+ if err != nil {
+ return err
+ }
+ if status == libcontainer.Stopped {
+ return fmt.Errorf("container with id %s is not running", container.ID())
+ }
+ var (
+ stats = make(chan *libcontainer.Stats, 1)
+ events = make(chan *event, 1024)
+ group = &sync.WaitGroup{}
+ )
+ group.Add(1)
+ go func() {
+ defer group.Done()
+ enc := json.NewEncoder(os.Stdout)
+ for e := range events {
+ if err := enc.Encode(e); err != nil {
+ logrus.Error(err)
+ }
+ }
+ }()
+ if context.Bool("stats") {
+ s, err := container.Stats()
+ if err != nil {
+ return err
+ }
+ events <- &event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+ close(events)
+ group.Wait()
+ return nil
+ }
+ go func() {
+ for range time.Tick(context.Duration("interval")) {
+ s, err := container.Stats()
+ if err != nil {
+ logrus.Error(err)
+ continue
+ }
+ stats <- s
+ }
+ }()
+ n, err := container.NotifyOOM()
+ if err != nil {
+ return err
+ }
+ for {
+ select {
+ case _, ok := <-n:
+ if ok {
+ // this means an oom event was received, if it is !ok then
+ // the channel was closed because the container stopped and
+ // the cgroups no longer exist.
+ events <- &event{Type: "oom", ID: container.ID()}
+ } else {
+ n = nil
+ }
+ case s := <-stats:
+ events <- &event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+ }
+ if n == nil {
+ close(events)
+ break
+ }
+ }
+ group.Wait()
+ return nil
+ },
+}
+
// convertLibcontainerStats maps libcontainer's internal stats structures into
// runc's stable event format. Returns nil when there are no cgroup stats.
func convertLibcontainerStats(ls *libcontainer.Stats) *stats {
	cg := ls.CgroupStats
	if cg == nil {
		return nil
	}
	var s stats
	s.Pids.Current = cg.PidsStats.Current
	s.Pids.Limit = cg.PidsStats.Limit

	// CPU usage and CFS throttling counters.
	s.CPU.Usage.Kernel = cg.CpuStats.CpuUsage.UsageInKernelmode
	s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode
	s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage
	s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage
	s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods
	s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods
	s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime

	// Memory accounting buckets plus the raw memory.stat map.
	s.Memory.Cache = cg.MemoryStats.Cache
	s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage)
	s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage)
	s.Memory.Swap = convertMemoryEntry(cg.MemoryStats.SwapUsage)
	s.Memory.Usage = convertMemoryEntry(cg.MemoryStats.Usage)
	s.Memory.Raw = cg.MemoryStats.Stats

	// Recursive blkio counters, converted entry by entry.
	s.Blkio.IoServiceBytesRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceBytesRecursive)
	s.Blkio.IoServicedRecursive = convertBlkioEntry(cg.BlkioStats.IoServicedRecursive)
	s.Blkio.IoQueuedRecursive = convertBlkioEntry(cg.BlkioStats.IoQueuedRecursive)
	s.Blkio.IoServiceTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceTimeRecursive)
	s.Blkio.IoWaitTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoWaitTimeRecursive)
	s.Blkio.IoMergedRecursive = convertBlkioEntry(cg.BlkioStats.IoMergedRecursive)
	s.Blkio.IoTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoTimeRecursive)
	s.Blkio.SectorsRecursive = convertBlkioEntry(cg.BlkioStats.SectorsRecursive)

	s.Hugetlb = make(map[string]hugetlb)
	for k, v := range cg.HugetlbStats {
		s.Hugetlb[k] = convertHugtlb(v)
	}

	// Intel RDT stats are only populated for the features the kernel
	// reports as enabled (CAT for cache, MBA for memory bandwidth).
	if is := ls.IntelRdtStats; is != nil {
		if intelrdt.IsCatEnabled() {
			s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo)
			s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot
			s.IntelRdt.L3CacheSchema = is.L3CacheSchema
		}
		if intelrdt.IsMbaEnabled() {
			s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo)
			s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot
			s.IntelRdt.MemBwSchema = is.MemBwSchema
		}
	}

	return &s
}
+
+func convertHugtlb(c cgroups.HugetlbStats) hugetlb {
+ return hugetlb{
+ Usage: c.Usage,
+ Max: c.MaxUsage,
+ Failcnt: c.Failcnt,
+ }
+}
+
+func convertMemoryEntry(c cgroups.MemoryData) memoryEntry {
+ return memoryEntry{
+ Limit: c.Limit,
+ Usage: c.Usage,
+ Max: c.MaxUsage,
+ Failcnt: c.Failcnt,
+ }
+}
+
+func convertBlkioEntry(c []cgroups.BlkioStatEntry) []blkioEntry {
+ var out []blkioEntry
+ for _, e := range c {
+ out = append(out, blkioEntry{
+ Major: e.Major,
+ Minor: e.Minor,
+ Op: e.Op,
+ Value: e.Value,
+ })
+ }
+ return out
+}
+
+func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *l3CacheInfo {
+ return &l3CacheInfo{
+ CbmMask: i.CbmMask,
+ MinCbmBits: i.MinCbmBits,
+ NumClosids: i.NumClosids,
+ }
+}
+
+func convertMemBwInfo(i *intelrdt.MemBwInfo) *memBwInfo {
+ return &memBwInfo{
+ BandwidthGran: i.BandwidthGran,
+ DelayLinear: i.DelayLinear,
+ MinBandwidth: i.MinBandwidth,
+ NumClosids: i.NumClosids,
+ }
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/urfave/cli"
+)
+
// execCommand implements "runc exec": it runs an additional process inside an
// already-running container, either from command-line arguments or from a
// process.json supplied via -p.
var execCommand = cli.Command{
	Name:  "exec",
	Usage: "execute new process inside the container",
	ArgsUsage: `<container-id> <command> [command options] || -p process.json <container-id>

Where "<container-id>" is the name for the instance of the container and
"<command>" is the command to be executed in the container.
"<command>" can't be empty unless a "-p" flag provided.

EXAMPLE:
For example, if the container is configured to run the linux ps command the
following will output a list of processes running in the container:

       # runc exec <container-id> ps`,
	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "console-socket",
			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
		},
		cli.StringFlag{
			Name:  "cwd",
			Usage: "current working directory in the container",
		},
		cli.StringSliceFlag{
			Name:  "env, e",
			Usage: "set environment variables",
		},
		cli.BoolFlag{
			Name:  "tty, t",
			Usage: "allocate a pseudo-TTY",
		},
		cli.StringFlag{
			Name:  "user, u",
			Usage: "UID (format: <uid>[:<gid>])",
		},
		cli.Int64SliceFlag{
			Name:  "additional-gids, g",
			Usage: "additional gids",
		},
		cli.StringFlag{
			Name:  "process, p",
			Usage: "path to the process.json",
		},
		cli.BoolFlag{
			Name:  "detach,d",
			Usage: "detach from the container's process",
		},
		cli.StringFlag{
			Name:  "pid-file",
			Value: "",
			Usage: "specify the file to write the process id to",
		},
		cli.StringFlag{
			// NOTE(review): "asm" in the usage text below looks like a typo
			// (presumably "selinux"); it is a runtime string, so it is left
			// unchanged here — confirm before editing.
			Name:  "process-label",
			Usage: "set the asm process label for the process commonly used with selinux",
		},
		cli.StringFlag{
			Name:  "apparmor",
			Usage: "set the apparmor profile for the process",
		},
		cli.BoolFlag{
			Name:  "no-new-privs",
			Usage: "set the no new privileges value for the process",
		},
		cli.StringSliceFlag{
			Name:  "cap, c",
			Value: &cli.StringSlice{},
			Usage: "add a capability to the bounding set for the process",
		},
		cli.BoolFlag{
			Name:   "no-subreaper",
			Usage:  "disable the use of the subreaper used to reap reparented processes",
			Hidden: true,
		},
	},
	Action: func(context *cli.Context) error {
		// At least the container id must be present; the command may come
		// from -p process.json instead of positional arguments.
		if err := checkArgs(context, 1, minArgs); err != nil {
			return err
		}
		if err := revisePidFile(context); err != nil {
			return err
		}
		status, err := execProcess(context)
		if err == nil {
			// Propagate the exec'd process's exit status to our caller.
			os.Exit(status)
		}
		return fmt.Errorf("exec failed: %v", err)
	},
	// Keep flags and the command's own arguments in user-specified order.
	SkipArgReorder: true,
}
+
+// execProcess locates the container named on the command line, builds the
+// process spec to run (from -p/--process or from CLI flags over the bundle
+// config), and executes it via the shared runner, returning its exit status.
+func execProcess(context *cli.Context) (int, error) {
+ c, err := getContainer(context)
+ if err != nil {
+ return -1, err
+ }
+ st, err := c.Status()
+ if err != nil {
+ return -1, err
+ }
+ if st == libcontainer.Stopped {
+ return -1, fmt.Errorf("cannot exec a container that has stopped")
+ }
+ // Without -p, the args following <container-id> form the command line.
+ if context.String("process") == "" && len(context.Args()) == 1 {
+ return -1, fmt.Errorf("process args cannot be empty")
+ }
+ state, err := c.State()
+ if err != nil {
+ return -1, err
+ }
+ // The bundle directory is recorded as a label in the container state.
+ proc, err := getProcess(context, utils.SearchLabels(state.Config.Labels, "bundle"))
+ if err != nil {
+ return -1, err
+ }
+ run := &runner{
+ enableSubreaper: false,
+ shouldDestroy: false,
+ container: c,
+ consoleSocket: context.String("console-socket"),
+ detach: context.Bool("detach"),
+ pidFile: context.String("pid-file"),
+ action: CT_ACT_RUN,
+ init: false,
+ }
+ return run.run(proc)
+}
+
+// getProcess builds the specs.Process to execute. If -p/--process was given,
+// the process is decoded from that JSON file and validated; otherwise the
+// process section of the bundle's config.json is used as a template and
+// overridden by the CLI flags (args, cwd, env, tty, user, capabilities, ...).
+func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
+ if path := context.String("process"); path != "" {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+ var p specs.Process
+ if err := json.NewDecoder(f).Decode(&p); err != nil {
+ return nil, err
+ }
+ return &p, validateProcessSpec(&p)
+ }
+ // process via cli flags; chdir to the bundle so that relative paths in
+ // the spec resolve correctly.
+ if err := os.Chdir(bundle); err != nil {
+ return nil, err
+ }
+ spec, err := loadSpec(specConfig)
+ if err != nil {
+ return nil, err
+ }
+ p := spec.Process
+ p.Args = context.Args()[1:]
+ // override the cwd, if passed
+ if context.String("cwd") != "" {
+ p.Cwd = context.String("cwd")
+ }
+ if ap := context.String("apparmor"); ap != "" {
+ p.ApparmorProfile = ap
+ }
+ if l := context.String("process-label"); l != "" {
+ p.SelinuxLabel = l
+ }
+ if caps := context.StringSlice("cap"); len(caps) > 0 {
+ // The bundle spec may omit the capabilities section entirely;
+ // allocate it before appending to avoid a nil-pointer dereference.
+ if p.Capabilities == nil {
+ p.Capabilities = &specs.LinuxCapabilities{}
+ }
+ for _, c := range caps {
+ p.Capabilities.Bounding = append(p.Capabilities.Bounding, c)
+ p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, c)
+ p.Capabilities.Effective = append(p.Capabilities.Effective, c)
+ p.Capabilities.Permitted = append(p.Capabilities.Permitted, c)
+ p.Capabilities.Ambient = append(p.Capabilities.Ambient, c)
+ }
+ }
+ // append the passed env variables
+ p.Env = append(p.Env, context.StringSlice("env")...)
+
+ // set the tty
+ if context.IsSet("tty") {
+ p.Terminal = context.Bool("tty")
+ }
+ if context.IsSet("no-new-privs") {
+ p.NoNewPrivileges = context.Bool("no-new-privs")
+ }
+ // override the user, if passed
+ if context.String("user") != "" {
+ u := strings.SplitN(context.String("user"), ":", 2)
+ if len(u) > 1 {
+ gid, err := strconv.Atoi(u[1])
+ if err != nil {
+ return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
+ }
+ p.User.GID = uint32(gid)
+ }
+ uid, err := strconv.Atoi(u[0])
+ if err != nil {
+ return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
+ }
+ p.User.UID = uint32(uid)
+ }
+ for _, gid := range context.Int64Slice("additional-gids") {
+ // gid 0 is valid; only reject negative values.
+ if gid < 0 {
+ return nil, fmt.Errorf("additional-gids must be a non-negative number %d", gid)
+ }
+ p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid))
+ }
+ return p, nil
+}
--- /dev/null
+package main
+
+import (
+ "os"
+ "runtime"
+
+ "github.com/opencontainers/runc/libcontainer"
+ _ "github.com/opencontainers/runc/libcontainer/nsenter"
+ "github.com/urfave/cli"
+)
+
+// init pins the bootstrap goroutine when this binary is re-invoked as
+// "runc init" (the second stage of container creation): the initialization
+// code performs thread-specific operations, so it must stay on one OS thread.
+func init() {
+ if len(os.Args) <= 1 || os.Args[1] != "init" {
+ return
+ }
+ runtime.GOMAXPROCS(1)
+ runtime.LockOSThread()
+}
+
+// initCommand is the internal entry point executed inside the freshly created
+// container namespaces; it must never be invoked directly by users.
+var initCommand = cli.Command{
+ Name: "init",
+ Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
+ Action: func(context *cli.Context) error {
+ factory, _ := libcontainer.New("")
+ if err := factory.StartInitialization(); err != nil {
+ // as the error is sent back to the parent there is no need to log
+ // or write it to stderr because the parent process will handle this
+ os.Exit(1)
+ }
+ // On success StartInitialization exec()s the container process and
+ // never returns, so reaching this line means the exec failed.
+ panic("libcontainer: container init failed to exec")
+ },
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+ "syscall"
+
+ "github.com/urfave/cli"
+)
+
+// killCommand implements "runc kill": deliver a signal (default SIGTERM) to
+// the container's init process, or to all container processes with --all.
+var killCommand = cli.Command{
+ Name: "kill",
+ Usage: "kill sends the specified signal (default: SIGTERM) to the container's init process",
+ ArgsUsage: `<container-id> [signal]
+
+Where "<container-id>" is the name for the instance of the container and
+"[signal]" is the signal to be sent to the init process.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" the following will send a "KILL"
+signal to the init process of the "ubuntu01" container:
+
+ # runc kill ubuntu01 KILL`,
+ Flags: []cli.Flag{
+ cli.BoolFlag{
+ Name: "all, a",
+ Usage: "send the specified signal to all processes inside the container",
+ },
+ },
+ Action: func(context *cli.Context) error {
+ // Accept one or two arguments: <container-id> and an optional signal.
+ if err := checkArgs(context, 1, minArgs); err != nil {
+ return err
+ }
+ if err := checkArgs(context, 2, maxArgs); err != nil {
+ return err
+ }
+ container, err := getContainer(context)
+ if err != nil {
+ return err
+ }
+
+ sigstr := context.Args().Get(1)
+ if sigstr == "" {
+ // No signal given: default to SIGTERM.
+ sigstr = "SIGTERM"
+ }
+
+ signal, err := parseSignal(sigstr)
+ if err != nil {
+ return err
+ }
+ return container.Signal(signal, context.Bool("all"))
+ },
+}
+
+// parseSignal converts rawSignal — either a numeric value ("9") or a signal
+// name with or without the SIG prefix ("KILL", "sigkill") — into a
+// syscall.Signal, returning an error for unknown names.
+func parseSignal(rawSignal string) (syscall.Signal, error) {
+ if num, err := strconv.Atoi(rawSignal); err == nil {
+ return syscall.Signal(num), nil
+ }
+ name := strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")
+ if sig, ok := signalMap[name]; ok {
+ return sig, nil
+ }
+ return -1, fmt.Errorf("unknown signal %q", rawSignal)
+}
--- /dev/null
+# libcontainer
+
+[](https://godoc.org/github.com/opencontainers/runc/libcontainer)
+
+Libcontainer provides a native Go implementation for creating containers
+with namespaces, cgroups, capabilities, and filesystem access controls.
+It allows you to manage the lifecycle of the container, performing additional
+operations after the container is created.
+
+
+#### Container
+A container is a self contained execution environment that shares the kernel of the
+host system and which is (optionally) isolated from other containers in the system.
+
+#### Using libcontainer
+
+Because containers are spawned in a two-step process, you will need a binary
+that will be executed as the init process for the container. In libcontainer,
+we use the current binary (/proc/self/exe) as the init process, invoking it
+with the argument "init". We call this first step "bootstrap", so you always
+need an "init" command in your program to serve as the bootstrap entry point.
+
+In addition to the go init function the early stage bootstrap is handled by importing
+[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
+
+```go
+import (
+ _ "github.com/opencontainers/runc/libcontainer/nsenter"
+)
+
+func init() {
+ if len(os.Args) > 1 && os.Args[1] == "init" {
+ runtime.GOMAXPROCS(1)
+ runtime.LockOSThread()
+ factory, _ := libcontainer.New("")
+ if err := factory.StartInitialization(); err != nil {
+ logrus.Fatal(err)
+ }
+ panic("--this line should have never been executed, congratulations--")
+ }
+}
+```
+
+Then to create a container you first have to initialize an instance of a factory
+that will handle the creation and initialization for a container.
+
+```go
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
+if err != nil {
+ logrus.Fatal(err)
+ return
+}
+```
+
+Once you have an instance of the factory created we can create a configuration
+struct describing how the container is to be created. A sample would look similar to this:
+
+```go
+defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+config := &configs.Config{
+ Rootfs: "/your/path/to/rootfs",
+ Capabilities: &configs.Capabilities{
+ Bounding: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Effective: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Inheritable: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Permitted: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Ambient: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ },
+ Namespaces: configs.Namespaces([]configs.Namespace{
+ {Type: configs.NEWNS},
+ {Type: configs.NEWUTS},
+ {Type: configs.NEWIPC},
+ {Type: configs.NEWPID},
+ {Type: configs.NEWUSER},
+ {Type: configs.NEWNET},
+ {Type: configs.NEWCGROUP},
+ }),
+ Cgroups: &configs.Cgroup{
+ Name: "test-container",
+ Parent: "system",
+ Resources: &configs.Resources{
+ MemorySwappiness: nil,
+ AllowAllDevices: nil,
+ AllowedDevices: configs.DefaultAllowedDevices,
+ },
+ },
+ MaskPaths: []string{
+ "/proc/kcore",
+ "/sys/firmware",
+ },
+ ReadonlyPaths: []string{
+ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+ },
+ Devices: configs.DefaultAutoCreatedDevices,
+ Hostname: "testing",
+ Mounts: []*configs.Mount{
+ {
+ Source: "proc",
+ Destination: "/proc",
+ Device: "proc",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "tmpfs",
+ Destination: "/dev",
+ Device: "tmpfs",
+ Flags: unix.MS_NOSUID | unix.MS_STRICTATIME,
+ Data: "mode=755",
+ },
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ Flags: unix.MS_NOSUID | unix.MS_NOEXEC,
+ Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
+ },
+ {
+ Device: "tmpfs",
+ Source: "shm",
+ Destination: "/dev/shm",
+ Data: "mode=1777,size=65536k",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "mqueue",
+ Destination: "/dev/mqueue",
+ Device: "mqueue",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "sysfs",
+ Destination: "/sys",
+ Device: "sysfs",
+ Flags: defaultMountFlags | unix.MS_RDONLY,
+ },
+ },
+ UidMappings: []configs.IDMap{
+ {
+ ContainerID: 0,
+ HostID: 1000,
+ Size: 65536,
+ },
+ },
+ GidMappings: []configs.IDMap{
+ {
+ ContainerID: 0,
+ HostID: 1000,
+ Size: 65536,
+ },
+ },
+ Networks: []*configs.Network{
+ {
+ Type: "loopback",
+ Address: "127.0.0.1/0",
+ Gateway: "localhost",
+ },
+ },
+ Rlimits: []configs.Rlimit{
+ {
+ Type: unix.RLIMIT_NOFILE,
+ Hard: uint64(1025),
+ Soft: uint64(1025),
+ },
+ },
+}
+```
+
+Once you have the configuration populated you can create a container:
+
+```go
+container, err := factory.Create("container-id", config)
+if err != nil {
+ logrus.Fatal(err)
+ return
+}
+```
+
+To spawn bash as the initial process inside the container and have the
+processes pid returned in order to wait, signal, or kill the process:
+
+```go
+process := &libcontainer.Process{
+ Args: []string{"/bin/bash"},
+ Env: []string{"PATH=/bin"},
+ User: "daemon",
+ Stdin: os.Stdin,
+ Stdout: os.Stdout,
+ Stderr: os.Stderr,
+}
+
+err := container.Run(process)
+if err != nil {
+ container.Destroy()
+ logrus.Fatal(err)
+ return
+}
+
+// wait for the process to finish.
+_, err := process.Wait()
+if err != nil {
+ logrus.Fatal(err)
+}
+
+// destroy the container.
+container.Destroy()
+```
+
+Additional ways to interact with a running container are:
+
+```go
+// return all the pids for all processes running inside the container.
+processes, err := container.Processes()
+
+// get detailed cpu, memory, io, and network statistics for the container and
+// it's processes.
+stats, err := container.Stats()
+
+// pause all processes inside the container.
+container.Pause()
+
+// resume all paused processes.
+container.Resume()
+
+// send signal to container's init process.
+container.Signal(signal)
+
+// update container resource constraints.
+container.Set(config)
+
+// get current status of the container.
+status, err := container.Status()
+
+// get current container's state information.
+state, err := container.State()
+```
+
+
+#### Checkpoint & Restore
+
+libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
+This lets you save the state of a process running inside a container to disk, and then restore
+that state into a new process, on the same machine or on another machine.
+
+`criu` version 1.5.2 or higher is required to use checkpoint and restore.
+If you don't already have `criu` installed, you can build it from source, following the
+[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
+generated when building libcontainer with docker.
+
+
+## Copyright and license
+
+Code and documentation copyright 2014 Docker, inc.
+The code and documentation are released under the [Apache 2.0 license](../LICENSE).
+The documentation is also released under Creative Commons Attribution 4.0 International License.
+You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.
--- /dev/null
+## Container Specification - v1
+
+This is the standard configuration for version 1 containers. It includes
+namespaces, standard filesystem setup, a default Linux capability set, and
+information about resource reservations. It also has information about any
+populated environment settings for the processes running inside a container.
+
+Along with the configuration of how a container is created the standard also
+discusses actions that can be performed on a container to manage and inspect
+information about the processes running inside.
+
+The v1 profile is meant to be able to accommodate the majority of applications
+with a strong security configuration.
+
+### System Requirements and Compatibility
+
+Minimum requirements:
+* Kernel version - 3.10 recommended; 2.6.2x minimum (with backported patches)
+* Mounted cgroups with each subsystem in its own hierarchy
+
+
+### Namespaces
+
+| Flag | Enabled |
+| --------------- | ------- |
+| CLONE_NEWPID | 1 |
+| CLONE_NEWUTS | 1 |
+| CLONE_NEWIPC | 1 |
+| CLONE_NEWNET | 1 |
+| CLONE_NEWNS | 1 |
+| CLONE_NEWUSER | 1 |
+| CLONE_NEWCGROUP | 1 |
+
+Namespaces are created for the container via the `unshare` syscall.
+
+
+### Filesystem
+
+A root filesystem must be provided to a container for execution. The container
+will use this root filesystem (rootfs) to jail and spawn processes inside where
+the binaries and system libraries are local to that directory. Any binaries
+to be executed must be contained within this rootfs.
+
+Mounts that happen inside the container are automatically cleaned up when the
+container exits as the mount namespace is destroyed and the kernel will
+unmount all the mounts that were setup within that namespace.
+
+For a container to execute properly there are certain filesystems that
+are required to be mounted within the rootfs that the runtime will setup.
+
+| Path | Type | Flags | Data |
+| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
+| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
+| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 |
+| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k |
+| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
+| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 |
+| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | |
+
+
+After a container's filesystems are mounted within the newly created
+mount namespace `/dev` will need to be populated with a set of device nodes.
+It is expected that a rootfs does not need to have any device nodes specified
+for `/dev` within the rootfs as the container will setup the correct devices
+that are required for executing a container's process.
+
+| Path | Mode | Access |
+| ------------ | ---- | ---------- |
+| /dev/null | 0666 | rwm |
+| /dev/zero | 0666 | rwm |
+| /dev/full | 0666 | rwm |
+| /dev/tty | 0666 | rwm |
+| /dev/random | 0666 | rwm |
+| /dev/urandom | 0666 | rwm |
+
+
+**ptmx**
+`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
+the container.
+
+The use of a pseudo TTY is optional within a container and it should support both.
+If a pseudo TTY is provided to the container, `/dev/console` will need to be
+setup by binding the console in `/dev/` after it has been populated and mounted
+in tmpfs.
+
+| Source | Destination | UID GID | Mode | Type |
+| --------------- | ------------ | ------- | ---- | ---- |
+| *pty host path* | /dev/console | 0 0 | 0600 | bind |
+
+
+After `/dev/null` has been setup we check for any external links between
+the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
+to `/dev/null` outside the container we close and `dup2` the `/dev/null`
+that is local to the container's rootfs.
+
+
+After the container has `/proc` mounted a few standard symlinks are setup
+within `/dev/` for the io.
+
+| Source | Destination |
+| --------------- | ----------- |
+| /proc/self/fd | /dev/fd |
+| /proc/self/fd/0 | /dev/stdin |
+| /proc/self/fd/1 | /dev/stdout |
+| /proc/self/fd/2 | /dev/stderr |
+
+A `pivot_root` is used to change the root for the process, effectively
+jailing the process inside the rootfs.
+
+```c
+put_old = mkdir(...);
+pivot_root(rootfs, put_old);
+chdir("/");
+unmount(put_old, MS_DETACH);
+rmdir(put_old);
+```
+
+For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined
+with a `chroot` is required as `pivot_root` is not supported in `ramfs`.
+
+```c
+mount(rootfs, "/", NULL, MS_MOVE, NULL);
+chroot(".");
+chdir("/");
+```
+
+The `umask` is set back to `0022` after the filesystem setup has been completed.
+
+### Resources
+
+Cgroups are used to handle resource allocation for containers. This includes
+system resources like cpu, memory, and device access.
+
+| Subsystem | Enabled |
+| ---------- | ------- |
+| devices | 1 |
+| memory | 1 |
+| cpu | 1 |
+| cpuacct | 1 |
+| cpuset | 1 |
+| blkio | 1 |
+| perf_event | 1 |
+| freezer | 1 |
+| hugetlb | 1 |
+| pids | 1 |
+
+
+All cgroup subsystem are joined so that statistics can be collected from
+each of the subsystems. Freezer does not expose any stats but is joined
+so that containers can be paused and resumed.
+
+The parent process of the container's init must place the init pid inside
+the correct cgroups before the initialization begins. This is done so
+that no processes or threads escape the cgroups. This sync is
+done via a pipe ( specified in the runtime section below ) that the container's
+init process will block waiting for the parent to finish setup.
+
+### IntelRdt
+
+Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
+two sub-features of RDT.
+
+Cache Allocation Technology (CAT) provides a way for the software to restrict
+cache allocation to a defined 'subset' of L3 cache which may be overlapping
+with other 'subsets'. The different subsets are identified by class of
+service (CLOS) and each CLOS has a capacity bitmask (CBM).
+
+Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
+over memory bandwidth for the software. A user controls the resource by
+indicating the percentage of maximum memory bandwidth.
+
+It can be used to handle L3 cache and memory bandwidth resources allocation
+for containers if hardware and kernel support Intel RDT CAT and MBA features.
+
+In Linux 4.10 kernel or newer, the interface is defined and exposed via
+"resource control" filesystem, which is a "cgroup-like" interface.
+
+Comparing with cgroups, it has similar process management lifecycle and
+interfaces in a container. But unlike cgroups' hierarchy, it has single level
+filesystem layout.
+
+CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
+"resource control" filesystem.
+
+Intel RDT "resource control" filesystem hierarchy:
+```
+mount -t resctrl resctrl /sys/fs/resctrl
+tree /sys/fs/resctrl
+/sys/fs/resctrl/
+|-- info
+| |-- L3
+| | |-- cbm_mask
+| | |-- min_cbm_bits
+| | |-- num_closids
+| |-- MB
+| |-- bandwidth_gran
+| |-- delay_linear
+| |-- min_bandwidth
+| |-- num_closids
+|-- ...
+|-- schemata
+|-- tasks
+|-- <container_id>
+ |-- ...
+ |-- schemata
+ |-- tasks
+```
+
+For runc, we can make use of `tasks` and `schemata` configuration for L3
+cache and memory bandwidth resources constraints.
+
+The file `tasks` has a list of tasks that belong to this group (e.g.,
+"<container_id>" group). Tasks can be added to a group by writing the task ID
+to the "tasks" file (which will automatically remove them from the previous
+group to which they belonged). New tasks created by fork(2) and clone(2) are
+added to the same group as their parent.
+
+The file `schemata` has a list of all the resources available to this group.
+Each resource (L3 cache, memory bandwidth) has its own line and format.
+
+L3 cache schema:
+It has allocation bitmasks/values for L3 cache on each socket, which
+contains L3 cache id and capacity bitmask (CBM).
+```
+ Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+```
+For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
+which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+
+The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
+be set is less than the max bit. The max bits in the CBM is varied among
+supported Intel CPU models. Kernel will check if it is valid when writing.
+e.g., default value 0xfffff in root indicates the max bits of CBM is 20
+bits, which mapping to entire L3 cache capacity. Some valid CBM values to
+set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+
+Memory bandwidth schema:
+It has allocation values for memory bandwidth on each socket, which contains
+L3 cache id and memory bandwidth percentage.
+```
+ Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+```
+For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
+
+The minimum bandwidth percentage value for each CPU model is predefined and
+can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
+that is allocated is also dependent on the CPU model and can be looked up at
+"info/MB/bandwidth_gran". The available bandwidth control steps are:
+min_bw + N * bw_gran. Intermediate values are rounded to the next control
+step available on the hardware.
+
+For more information about Intel RDT kernel interface:
+https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+
+```
+An example for runc:
+Consider a two-socket machine with two L3 caches where the default CBM is
+0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
+with a memory bandwidth granularity of 10%.
+
+Tasks inside the container only have access to the "upper" 7/11 of L3 cache
+on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
+maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
+
+"linux": {
+ "intelRdt": {
+ "closID": "guaranteed_group",
+ "l3CacheSchema": "L3:0=7f0;1=1f",
+ "memBwSchema": "MB:0=20;1=70"
+ }
+}
+```
+
+### Security
+
+The standard set of Linux capabilities that are set in a container
+provide a good default for security and flexibility for the applications.
+
+
+| Capability | Enabled |
+| -------------------- | ------- |
+| CAP_NET_RAW | 1 |
+| CAP_NET_BIND_SERVICE | 1 |
+| CAP_AUDIT_READ | 1 |
+| CAP_AUDIT_WRITE | 1 |
+| CAP_DAC_OVERRIDE | 1 |
+| CAP_SETFCAP | 1 |
+| CAP_SETPCAP | 1 |
+| CAP_SETGID | 1 |
+| CAP_SETUID | 1 |
+| CAP_MKNOD | 1 |
+| CAP_CHOWN | 1 |
+| CAP_FOWNER | 1 |
+| CAP_FSETID | 1 |
+| CAP_KILL | 1 |
+| CAP_SYS_CHROOT | 1 |
+| CAP_NET_BROADCAST | 0 |
+| CAP_SYS_MODULE | 0 |
+| CAP_SYS_RAWIO | 0 |
+| CAP_SYS_PACCT | 0 |
+| CAP_SYS_ADMIN | 0 |
+| CAP_SYS_NICE | 0 |
+| CAP_SYS_RESOURCE | 0 |
+| CAP_SYS_TIME | 0 |
+| CAP_SYS_TTY_CONFIG | 0 |
+| CAP_AUDIT_CONTROL | 0 |
+| CAP_MAC_OVERRIDE | 0 |
+| CAP_MAC_ADMIN | 0 |
+| CAP_NET_ADMIN | 0 |
+| CAP_SYSLOG | 0 |
+| CAP_DAC_READ_SEARCH | 0 |
+| CAP_LINUX_IMMUTABLE | 0 |
+| CAP_IPC_LOCK | 0 |
+| CAP_IPC_OWNER | 0 |
+| CAP_SYS_PTRACE | 0 |
+| CAP_SYS_BOOT | 0 |
+| CAP_LEASE | 0 |
+| CAP_WAKE_ALARM | 0 |
+| CAP_BLOCK_SUSPEND | 0 |
+
+
+Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
+and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
+the containers. A container should support setting an apparmor profile or
+selinux process and mount labels if provided in the configuration.
+
+Standard apparmor profile:
+```c
+#include <tunables/global>
+profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
+ #include <abstractions/base>
+ network,
+ capability,
+ file,
+ umount,
+
+ deny @{PROC}/sys/fs/** wklx,
+ deny @{PROC}/sysrq-trigger rwklx,
+ deny @{PROC}/mem rwklx,
+ deny @{PROC}/kmem rwklx,
+ deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
+ deny @{PROC}/sys/kernel/*/** wklx,
+
+ deny mount,
+
+ deny /sys/[^f]*/** wklx,
+ deny /sys/f[^s]*/** wklx,
+ deny /sys/fs/[^c]*/** wklx,
+ deny /sys/fs/c[^g]*/** wklx,
+ deny /sys/fs/cg[^r]*/** wklx,
+ deny /sys/firmware/efi/efivars/** rwklx,
+ deny /sys/kernel/security/** rwklx,
+}
+```
+
+*TODO: seccomp work is being done to find a good default config*
+
+### Runtime and Init Process
+
+During container creation the parent process needs to talk to the container's init
+process and have a form of synchronization. This is accomplished by creating
+a pipe that is passed to the container's init. When the init process first spawns
+it will block on its side of the pipe until the parent closes its side. This
+allows the parent to have time to set the new process inside a cgroup hierarchy
+and/or write any uid/gid mappings required for user namespaces.
+The pipe is passed to the init process via FD 3.
+
+The application consuming libcontainer should be compiled statically. libcontainer
+does not define any init process and the arguments provided are used to `exec` the
+process inside the application. There should be no long running init within the
+container spec.
+
+If a pseudo tty is provided to a container it will open and `dup2` the console
+as the container's STDIN, STDOUT, STDERR as well as mounting the console
+as `/dev/console`.
+
+An extra set of mounts are provided to a container and setup for use. A container's
+rootfs can contain some non portable files inside that can cause side effects during
+execution of a process. These files are usually created and populated with the container
+specific information via the runtime.
+
+**Extra runtime files:**
+* /etc/hosts
+* /etc/resolv.conf
+* /etc/hostname
+* /etc/localtime
+
+
+#### Defaults
+
+There are a few defaults that can be overridden by users, but in their omission
+these apply to processes within a container.
+
+| Type | Value |
+| ------------------- | ------------------------------ |
+| Parent Death Signal | SIGKILL |
+| UID | 0 |
+| GID | 0 |
+| GROUPS | 0, NULL |
+| CWD | "/" |
+| $HOME | Current user's home dir or "/" |
+| Readonly rootfs | false |
+| Pseudo TTY | false |
+
+
+## Actions
+
+After a container is created there is a standard set of actions that can
+be done to the container. These actions are part of the public API for
+a container.
+
+| Action | Description |
+| -------------- | ------------------------------------------------------------------ |
+| Get processes | Return all the pids for processes running inside a container |
+| Get Stats | Return resource statistics for the container as a whole |
+| Wait | Waits on the container's init process ( pid 1 ) |
+| Wait Process | Wait on any of the container's processes returning the exit status |
+| Destroy | Kill the container's init process and remove any filesystem state |
+| Signal | Send a signal to the container's init process |
+| Signal Process | Send a signal to any of the container's processes |
+| Pause | Pause all processes inside the container |
+| Resume | Resume all processes inside the container if paused |
+| Exec | Execute a new process inside of the container ( requires setns ) |
+| Set | Setup configs of the container after it's created |
+
+### Execute a new process inside of a running container
+
+User can execute a new process inside of a running container. Any binaries to be
+executed must be accessible within the container's rootfs.
+
+The started process will run inside the container's rootfs. Any changes
+made by the process to the container's filesystem will persist after the
+process finished executing.
+
+The started process will join all the container's existing namespaces. When the
+container is paused, the process will also be paused and will resume when
+the container is unpaused. The started process will only run when the container's
+primary process (PID 1) is running, and will not be restarted when the container
+is restarted.
+
+#### Planned additions
+
+The started process will have its own cgroups nested inside the container's
+cgroups. This is used for process tracking and optionally resource allocation
+handling for the new process. Freezer cgroup is required, the rest of the cgroups
+are optional. The process executor must place its pid inside the correct
+cgroups before starting the process. This is done so that no child processes or
+threads can escape the cgroups.
+
+When the process is stopped, the process executor will try (in a best-effort way)
+to stop all its children and remove the sub-cgroups.
--- /dev/null
+// +build apparmor,linux
+
+package apparmor
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+)
+
+// IsEnabled returns true if apparmor is enabled for the host.
+func IsEnabled() bool {
+ if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
+ if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
+ buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
+ return err == nil && len(buf) > 1 && buf[0] == 'Y'
+ }
+ }
+ return false
+}
+
+// setprocattr writes value to /proc/self/attr/<attr>.
+func setprocattr(attr, value string) error {
+ // AppArmor only permits a process to change its own attributes, hence
+ // /proc/self/ rather than /proc/<tid>/ as libapparmor uses.
+ path := fmt.Sprintf("/proc/self/attr/%s", attr)
+
+ f, err := os.OpenFile(path, os.O_WRONLY, 0)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ _, err = f.WriteString(value)
+ return err
+}
+
+// changeOnExec reimplements aa_change_onexec from libapparmor in Go: it asks
+// the kernel to switch this task to the named profile at the next exec.
+func changeOnExec(name string) error {
+ if err := setprocattr("exec", "exec "+name); err != nil {
+ return fmt.Errorf("apparmor failed to apply profile: %s", err)
+ }
+ return nil
+}
+
+// ApplyProfile will apply the profile with the specified name to the process
+// after the next exec. An empty name is a no-op.
+func ApplyProfile(name string) error {
+ if name != "" {
+ return changeOnExec(name)
+ }
+ return nil
+}
--- /dev/null
+// +build !apparmor !linux
+
+package apparmor
+
+import (
+ "errors"
+)
+
+var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
+
+// IsEnabled always reports false: this build (or OS) has no apparmor support.
+func IsEnabled() bool {
+ return false
+}
+
+// ApplyProfile fails with ErrApparmorNotEnabled if a non-empty profile name
+// is requested, since apparmor is unavailable; an empty name succeeds.
+func ApplyProfile(name string) error {
+ if name != "" {
+ return ErrApparmorNotEnabled
+ }
+ return nil
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/syndtr/gocapability/capability"
+)
+
+// allCapabilityTypes is the union of every capability set type we manage.
+const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
+
+// capabilityMap maps spec-style names ("CAP_FOO") to capability constants;
+// it is populated once in init.
+var capabilityMap map[string]capability.Cap
+
+// init builds capabilityMap from the library's capability list, skipping
+// capabilities newer than what the running kernel supports.
+func init() {
+ capabilityMap = make(map[string]capability.Cap)
+ last := capability.CAP_LAST_CAP
+ // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
+ if last == capability.Cap(63) {
+ last = capability.CAP_BLOCK_SUSPEND
+ }
+ for _, cap := range capability.List() {
+ if cap > last {
+ continue
+ }
+ // Keys use the OCI spec naming convention, e.g. "CAP_SYS_ADMIN".
+ capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
+ capabilityMap[capKey] = cap
+ }
+}
+
+// newContainerCapList translates the string capability names in capConfig
+// into a containerCapabilities whose sets can be applied to the current
+// process. It returns an error for any capability name the host does not
+// recognize.
+func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
+ // toCapList resolves a list of spec-style names (e.g. "CAP_SYS_ADMIN")
+ // into capability constants. It always returns a non-nil slice so the
+ // resulting sets are explicit, even when empty.
+ toCapList := func(names []string) ([]capability.Cap, error) {
+ caps := []capability.Cap{}
+ for _, c := range names {
+ v, ok := capabilityMap[c]
+ if !ok {
+ return nil, fmt.Errorf("unknown capability %q", c)
+ }
+ caps = append(caps, v)
+ }
+ return caps, nil
+ }
+ bounding, err := toCapList(capConfig.Bounding)
+ if err != nil {
+ return nil, err
+ }
+ effective, err := toCapList(capConfig.Effective)
+ if err != nil {
+ return nil, err
+ }
+ inheritable, err := toCapList(capConfig.Inheritable)
+ if err != nil {
+ return nil, err
+ }
+ permitted, err := toCapList(capConfig.Permitted)
+ if err != nil {
+ return nil, err
+ }
+ ambient, err := toCapList(capConfig.Ambient)
+ if err != nil {
+ return nil, err
+ }
+ // Pid 0 means "the current process".
+ pid, err := capability.NewPid(0)
+ if err != nil {
+ return nil, err
+ }
+ return &containerCapabilities{
+ bounding: bounding,
+ effective: effective,
+ inheritable: inheritable,
+ permitted: permitted,
+ ambient: ambient,
+ pid: pid,
+ }, nil
+}
+
+// containerCapabilities holds the resolved capability sets for a container
+// process, plus the capability handle (pid) used to apply them.
+type containerCapabilities struct {
+ pid capability.Capabilities
+ bounding []capability.Cap
+ effective []capability.Cap
+ inheritable []capability.Cap
+ permitted []capability.Cap
+ ambient []capability.Cap
+}
+
+// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
+func (c *containerCapabilities) ApplyBoundingSet() error {
+ c.pid.Clear(capability.BOUNDS)
+ c.pid.Set(capability.BOUNDS, c.bounding...)
+ return c.pid.Apply(capability.BOUNDS)
+}
+
+// ApplyCaps sets all the capability sets (bounding, permitted, inheritable,
+// effective, ambient) for the current process in one shot.
+func (c *containerCapabilities) ApplyCaps() error {
+ c.pid.Clear(allCapabilityTypes)
+ c.pid.Set(capability.BOUNDS, c.bounding...)
+ c.pid.Set(capability.PERMITTED, c.permitted...)
+ c.pid.Set(capability.INHERITABLE, c.inheritable...)
+ c.pid.Set(capability.EFFECTIVE, c.effective...)
+ c.pid.Set(capability.AMBIENT, c.ambient...)
+ return c.pid.Apply(allCapabilityTypes)
+}
--- /dev/null
+// +build linux
+
+package cgroups
+
+import (
+ "fmt"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Manager is the abstraction over a concrete cgroup implementation
+// (e.g. cgroupfs or systemd) used to manage a container's cgroups.
+type Manager interface {
+ // Applies cgroup configuration to the process with the specified pid
+ Apply(pid int) error
+
+ // Returns the PIDs inside the cgroup set
+ GetPids() ([]int, error)
+
+ // Returns the PIDs inside the cgroup set & all sub-cgroups
+ GetAllPids() ([]int, error)
+
+ // Returns statistics for the cgroup set
+ GetStats() (*Stats, error)
+
+ // Toggles the freezer cgroup according with specified state
+ Freeze(state configs.FreezerState) error
+
+ // Destroys the cgroup set
+ Destroy() error
+
+ // The option func SystemdCgroups() and Cgroupfs() require following attributes:
+ // Paths map[string]string
+ // Cgroups *configs.Cgroup
+ // Paths maps cgroup subsystem to path at which it is mounted.
+ // Cgroups specifies specific cgroup settings for the various subsystems
+
+ // Returns cgroup paths to save in a state file and to be able to
+ // restore the object later.
+ GetPaths() map[string]string
+
+ // Sets the cgroup as configured.
+ Set(container *configs.Config) error
+}
+
+// NotFoundError reports that the mountpoint for a cgroup subsystem could
+// not be located; test for it with IsNotFound.
+type NotFoundError struct {
+ Subsystem string
+}
+
+// Error implements the error interface.
+func (e *NotFoundError) Error() string {
+ return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
+}
+
+// NewNotFoundError returns a NotFoundError for the given subsystem name.
+func NewNotFoundError(sub string) error {
+ return &NotFoundError{
+ Subsystem: sub,
+ }
+}
+
+// IsNotFound reports whether err is a *NotFoundError.
+func IsNotFound(err error) bool {
+ if err == nil {
+ return false
+ }
+ _, ok := err.(*NotFoundError)
+ return ok
+}
--- /dev/null
+// +build linux
+
+package cgroups
+
+import (
+ "testing"
+)
+
+// TestParseCgroups sanity-checks ParseCgroupFile against this process's own
+// /proc/self/cgroup.
+// NOTE(review): assumes a cgroup v1 host where a "cpu" controller entry
+// exists; on a unified (v2-only) hierarchy this test would fail — confirm
+// the CI environment.
+func TestParseCgroups(t *testing.T) {
+ cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if _, ok := cgroups["cpu"]; !ok {
+ t.Fail()
+ }
+}
--- /dev/null
+// +build !linux
+
+package cgroups
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "sync"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/pkg/errors"
+ "golang.org/x/sys/unix"
+)
+
+// subsystems is the fixed set of cgroup v1 subsystems this package manages,
+// applied in this order. HugePageSizes is probed once at package load;
+// its error is deliberately ignored (hosts without hugepages get nil).
+var (
+ subsystems = subsystemSet{
+ &CpusetGroup{},
+ &DevicesGroup{},
+ &MemoryGroup{},
+ &CpuGroup{},
+ &CpuacctGroup{},
+ &PidsGroup{},
+ &BlkioGroup{},
+ &HugetlbGroup{},
+ &NetClsGroup{},
+ &NetPrioGroup{},
+ &PerfEventGroup{},
+ &FreezerGroup{},
+ &NameGroup{GroupName: "name=systemd", Join: true},
+ }
+ HugePageSizes, _ = cgroups.GetHugePageSize()
+)
+
+var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
+
+// subsystemSet is a list of subsystems searchable by name.
+type subsystemSet []subsystem
+
+// Get returns the subsystem with the given name, or
+// errSubsystemDoesNotExist if it is not in the set.
+func (s subsystemSet) Get(name string) (subsystem, error) {
+ for _, ss := range s {
+ if ss.Name() == name {
+ return ss, nil
+ }
+ }
+ return nil, errSubsystemDoesNotExist
+}
+
+// subsystem is implemented once per cgroup v1 controller (cpu, memory, ...).
+type subsystem interface {
+ // Name returns the name of the subsystem.
+ Name() string
+ // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
+ GetStats(path string, stats *cgroups.Stats) error
+ // Removes the cgroup represented by 'cgroupData'.
+ Remove(*cgroupData) error
+ // Creates and joins the cgroup represented by 'cgroupData'.
+ Apply(*cgroupData) error
+ // Set the cgroup represented by cgroup.
+ Set(path string, cgroup *configs.Cgroup) error
+}
+
+// Manager is the cgroupfs implementation of cgroups.Manager.
+// mu guards Paths, which is populated by Apply.
+type Manager struct {
+ mu sync.Mutex
+ Cgroups *configs.Cgroup
+ Rootless bool // ignore permission-related errors
+ Paths map[string]string
+}
+
+// The absolute path to the root of the cgroup hierarchies, discovered
+// lazily by getCgroupRoot; cgroupRootLock guards the memoized value.
+var cgroupRootLock sync.Mutex
+var cgroupRoot string
+
+// getCgroupRoot returns the memoized cgroup mountpoint root, discovering
+// and caching it on first use.
+func getCgroupRoot() (string, error) {
+ cgroupRootLock.Lock()
+ defer cgroupRootLock.Unlock()
+
+ // Fast path: already discovered.
+ if cgroupRoot != "" {
+ return cgroupRoot, nil
+ }
+
+ root, err := cgroups.FindCgroupMountpointDir()
+ if err != nil {
+ return "", err
+ }
+
+ // Make sure the discovered directory actually exists before caching it.
+ if _, err := os.Stat(root); err != nil {
+ return "", err
+ }
+
+ cgroupRoot = root
+ return cgroupRoot, nil
+}
+
+// cgroupData bundles everything needed to compute and join a container's
+// per-subsystem cgroup paths: the mount root, the path inside the hierarchy
+// (innerPath), the configuration, and the pid to place in the cgroup.
+type cgroupData struct {
+ root string
+ innerPath string
+ config *configs.Cgroup
+ pid int
+}
+
+// isIgnorableError returns whether err is a permission error (in the loose
+// sense of the word). This includes EROFS (which for an unprivileged user is
+// basically a permission error) and EACCES (for similar reasons) as well as
+// the normal EPERM.
+func isIgnorableError(rootless bool, err error) bool {
+ // We do not ignore errors if we are root.
+ if !rootless {
+ return false
+ }
+ // Is it an ordinary EPERM?
+ if os.IsPermission(errors.Cause(err)) {
+ return true
+ }
+
+ // Try to handle other errnos by unwrapping the standard os error
+ // wrappers down to the raw errno.
+ var errno error
+ switch err := errors.Cause(err).(type) {
+ case *os.PathError:
+ errno = err.Err
+ case *os.LinkError:
+ errno = err.Err
+ case *os.SyscallError:
+ errno = err.Err
+ }
+ return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
+}
+
+// Apply creates (if needed) and joins the cgroup for each subsystem,
+// placing pid into them and recording the resulting paths in m.Paths.
+// When c.Paths is preconfigured, it only joins those existing cgroups.
+func (m *Manager) Apply(pid int) (err error) {
+ if m.Cgroups == nil {
+ return nil
+ }
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ var c = m.Cgroups
+
+ d, err := getCgroupData(m.Cgroups, pid)
+ if err != nil {
+ return err
+ }
+
+ m.Paths = make(map[string]string)
+ if c.Paths != nil {
+ // Explicit paths were supplied: just join them, skipping
+ // subsystems that are not mounted on this host.
+ for name, path := range c.Paths {
+ _, err := d.path(name)
+ if err != nil {
+ if cgroups.IsNotFound(err) {
+ continue
+ }
+ return err
+ }
+ m.Paths[name] = path
+ }
+ return cgroups.EnterPid(m.Paths, pid)
+ }
+
+ for _, sys := range subsystems {
+ // TODO: Apply should, ideally, be reentrant or be broken up into a separate
+ // create and join phase so that the cgroup hierarchy for a container can be
+ // created then join consists of writing the process pids to cgroup.procs
+ p, err := d.path(sys.Name())
+ if err != nil {
+ // The non-presence of the devices subsystem is
+ // considered fatal for security reasons.
+ if cgroups.IsNotFound(err) && sys.Name() != "devices" {
+ continue
+ }
+ return err
+ }
+ m.Paths[sys.Name()] = p
+
+ if err := sys.Apply(d); err != nil {
+ // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
+ // been set, we don't bail on error in case of permission problems.
+ // Cases where limits have been set (and we couldn't create our own
+ // cgroup) are handled by Set.
+ if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
+ delete(m.Paths, sys.Name())
+ continue
+ }
+ return err
+ }
+
+ }
+ return nil
+}
+
+// Destroy removes all cgroup paths created by Apply. It is a no-op when
+// the cgroups were preconfigured via Cgroups.Paths (we did not create them,
+// so we do not remove them).
+func (m *Manager) Destroy() error {
+ if m.Cgroups == nil || m.Cgroups.Paths != nil {
+ return nil
+ }
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ if err := cgroups.RemovePaths(m.Paths); err != nil {
+ return err
+ }
+ m.Paths = make(map[string]string)
+ return nil
+}
+
+// GetPaths returns the subsystem-name-to-path map recorded by Apply.
+// Note the map itself is returned, not a copy; callers must not mutate it.
+func (m *Manager) GetPaths() map[string]string {
+ m.mu.Lock()
+ paths := m.Paths
+ m.mu.Unlock()
+ return paths
+}
+
+// GetStats collects statistics from every joined subsystem, silently
+// skipping subsystems that are unknown or whose path no longer exists.
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ stats := cgroups.NewStats()
+ for name, path := range m.Paths {
+ sys, err := subsystems.Get(name)
+ if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
+ continue
+ }
+ if err := sys.GetStats(path, stats); err != nil {
+ return nil, err
+ }
+ }
+ return stats, nil
+}
+
+// Set writes the configured resource limits into each subsystem's cgroup.
+func (m *Manager) Set(container *configs.Config) error {
+ // If Paths are set, then we are just joining cgroups paths
+ // and there is no need to set any values.
+ if m.Cgroups.Paths != nil {
+ return nil
+ }
+
+ paths := m.GetPaths()
+ for _, sys := range subsystems {
+ path := paths[sys.Name()]
+ if err := sys.Set(path, container.Cgroups); err != nil {
+ // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
+ // However, errors from other subsystems are not ignored.
+ // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+ if m.Rootless && sys.Name() == "devices" {
+ continue
+ }
+ if path == "" {
+ // We never created a path for this cgroup, so we cannot set
+ // limits for it (though we have already tried at this point).
+ return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
+ }
+ return err
+ }
+ }
+
+ // Sanity-check cpu.shares: the kernel clamps the written value, so
+ // verify the requested shares actually took effect.
+ if m.Paths["cpu"] != "" {
+ if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Freeze toggles the container's freezer cgroup depending on the state
+// provided. On failure the previously recorded freezer state is restored
+// so the in-memory config stays consistent with the cgroup.
+func (m *Manager) Freeze(state configs.FreezerState) error {
+ paths := m.GetPaths()
+ dir := paths["freezer"]
+ prevState := m.Cgroups.Resources.Freezer
+ m.Cgroups.Resources.Freezer = state
+ freezer, err := subsystems.Get("freezer")
+ if err != nil {
+ return err
+ }
+ err = freezer.Set(dir, m.Cgroups)
+ if err != nil {
+ // Roll back the recorded state; the cgroup was not changed.
+ m.Cgroups.Resources.Freezer = prevState
+ return err
+ }
+ return nil
+}
+
+// GetPids returns the pids in the container's cgroup, read via the devices
+// subsystem (which is mandatory, see Apply).
+func (m *Manager) GetPids() ([]int, error) {
+ paths := m.GetPaths()
+ return cgroups.GetPids(paths["devices"])
+}
+
+// GetAllPids returns the pids in the container's cgroup and all of its
+// sub-cgroups, read via the devices subsystem.
+func (m *Manager) GetAllPids() ([]int, error) {
+ paths := m.GetPaths()
+ return cgroups.GetAllPids(paths["devices"])
+}
+
+// getCgroupData resolves the cgroup configuration into a cgroupData,
+// computing the sanitized inner path from either Path or Parent+Name
+// (the two forms are mutually exclusive).
+func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ return nil, err
+ }
+
+ if (c.Name != "" || c.Parent != "") && c.Path != "" {
+ return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+ }
+
+ // XXX: Do not remove this code. Path safety is important! -- cyphar
+ // CleanPath strips ".." components so a malicious config cannot
+ // escape the cgroup mountpoint.
+ cgPath := libcontainerUtils.CleanPath(c.Path)
+ cgParent := libcontainerUtils.CleanPath(c.Parent)
+ cgName := libcontainerUtils.CleanPath(c.Name)
+
+ innerPath := cgPath
+ if innerPath == "" {
+ innerPath = filepath.Join(cgParent, cgName)
+ }
+
+ return &cgroupData{
+ root: root,
+ innerPath: innerPath,
+ config: c,
+ pid: pid,
+ }, nil
+}
+
+// path returns the absolute filesystem path of this cgroup for the given
+// subsystem. Absolute inner paths are resolved against the mountpoint;
+// relative ones are nested under the calling process's own cgroup.
+func (raw *cgroupData) path(subsystem string) (string, error) {
+ mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
+ // If we didn't mount the subsystem, there is no point we make the path.
+ if err != nil {
+ return "", err
+ }
+
+ // If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
+ if filepath.IsAbs(raw.innerPath) {
+ // Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
+ return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
+ }
+
+ // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+ // process could be in a container sharing a pid namespace with the host,
+ // in which case /proc/1/cgroup would describe a wholly different cgroup
+ // hierarchy.
+ parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
+ if err != nil {
+ return "", err
+ }
+
+ return filepath.Join(parentPath, raw.innerPath), nil
+}
+
+// join creates the cgroup directory for subsystem (if needed) and moves
+// raw.pid into it, returning the cgroup's path.
+func (raw *cgroupData) join(subsystem string) (string, error) {
+ path, err := raw.path(subsystem)
+ if err != nil {
+ return "", err
+ }
+ if err := os.MkdirAll(path, 0755); err != nil {
+ return "", err
+ }
+ if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
+ return "", err
+ }
+ return path, nil
+}
+
+// writeFile writes data to dir/file, wrapping any failure with the value
+// and filename for easier diagnosis.
+func writeFile(dir, file, data string) error {
+ // Normally dir should not be empty, one case is that cgroup subsystem
+ // is not mounted, we will get empty dir, and we want it fail here.
+ if dir == "" {
+ return fmt.Errorf("no such directory for %s", file)
+ }
+ if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
+ return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
+ }
+ return nil
+}
+
+// readFile returns the contents of dir/file as a string.
+func readFile(dir, file string) (string, error) {
+ data, err := ioutil.ReadFile(filepath.Join(dir, file))
+ return string(data), err
+}
+
+// removePath removes p unless a prior error is supplied (in which case it
+// is returned unchanged); an empty path is a no-op. This shape lets callers
+// write removePath(d.path("x")) directly.
+func removePath(p string, err error) error {
+ if err != nil {
+ return err
+ }
+ if p != "" {
+ return os.RemoveAll(p)
+ }
+ return nil
+}
+
+// CheckCpushares verifies that the cpu.shares value actually stored by the
+// kernel matches the requested value c. The kernel silently clamps writes
+// to its allowed range, so a mismatch means the request was out of bounds.
+// A requested value of 0 means "unset" and is not checked.
+func CheckCpushares(path string, c uint64) error {
+ var cpuShares uint64
+
+ if c == 0 {
+ return nil
+ }
+
+ fd, err := os.Open(filepath.Join(path, "cpu.shares"))
+ if err != nil {
+ return err
+ }
+ defer fd.Close()
+
+ _, err = fmt.Fscanf(fd, "%d", &cpuShares)
+ if err != nil && err != io.EOF {
+ return err
+ }
+
+ // The kernel clamped the value, so report which bound was violated.
+ if c > cpuShares {
+ return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
+ } else if c < cpuShares {
+ return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
+ }
+
+ return nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "path/filepath"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// TestInvalidCgroupPath verifies that a ".."-laden Path cannot escape the
+// cgroup mountpoint.
+func TestInvalidCgroupPath(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Path: "../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// TestInvalidAbsoluteCgroupPath verifies that an absolute ".."-laden Path
+// cannot escape the cgroup mountpoint.
+func TestInvalidAbsoluteCgroupPath(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Path: "/../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidCgroupParent verifies that a ".."-laden Parent cannot escape
+// the cgroup mountpoint.
+func TestInvalidCgroupParent(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "../../../../../../../../../../some/path",
+ Name: "name",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidAbsoluteCgroupParent verifies that an absolute ".."-laden
+// Parent cannot escape the cgroup mountpoint.
+func TestInvalidAbsoluteCgroupParent(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "/../../../../../../../../../../some/path",
+ Name: "name",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidCgroupName verifies that a ".."-laden Name cannot escape the
+// cgroup mountpoint.
+func TestInvalidCgroupName(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "parent",
+ Name: "../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidAbsoluteCgroupName verifies that an absolute ".."-laden Name
+// cannot escape the cgroup mountpoint.
+func TestInvalidAbsoluteCgroupName(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "parent",
+ Name: "/../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidCgroupNameAndParent verifies that ".."-laden Name and Parent
+// combined cannot escape the cgroup mountpoint.
+func TestInvalidCgroupNameAndParent(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "../../../../../../../../../../some/path",
+ Name: "../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
+
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+// TestInvalidAbsoluteCgroupNameAndParent verifies that absolute ".."-laden
+// Name and Parent combined cannot escape the cgroup mountpoint.
+func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) {
+ root, err := getCgroupRoot()
+ if err != nil {
+ // Fatalf (not Errorf): root is unusable below if this fails.
+ t.Fatalf("couldn't get cgroup root: %v", err)
+ }
+
+ config := &configs.Cgroup{
+ Parent: "/../../../../../../../../../../some/path",
+ Name: "/../../../../../../../../../../some/path",
+ }
+
+ data, err := getCgroupData(config, 0)
+ if err != nil {
+ // Fatalf: data is nil on error; continuing would panic on data.innerPath.
+ t.Fatalf("couldn't get cgroup data: %v", err)
+ }
+
+ // Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+ if strings.HasPrefix(data.innerPath, "..") {
+ t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+ }
+
+ // Double-check, using an actual cgroup.
+ deviceRoot := filepath.Join(root, "devices")
+ devicePath, err := data.path("devices")
+ if err != nil {
+ // Fatalf: devicePath is meaningless on error.
+ t.Fatalf("couldn't get cgroup path: %v", err)
+ }
+ if !strings.HasPrefix(devicePath, deviceRoot) {
+ t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+ }
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// BlkioGroup implements the subsystem interface for the blkio cgroup,
+// which controls and accounts block-device I/O.
+type BlkioGroup struct {
+}
+
+// Name returns the subsystem name, "blkio".
+func (s *BlkioGroup) Name() string {
+ return "blkio"
+}
+
+// Apply creates and joins the blkio cgroup for d; a missing blkio mount is
+// tolerated (the subsystem is optional).
+func (s *BlkioGroup) Apply(d *cgroupData) error {
+ _, err := d.join("blkio")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+// Set writes the configured blkio weights and per-device throttle limits
+// into the cgroup at path. Zero-valued weights are treated as "unset" and
+// skipped; device lists are written one entry per write, since the kernel
+// merges entries rather than replacing the file.
+func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if cgroup.Resources.BlkioWeight != 0 {
+ if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+ return err
+ }
+ }
+
+ if cgroup.Resources.BlkioLeafWeight != 0 {
+ if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
+ return err
+ }
+ }
+ for _, wd := range cgroup.Resources.BlkioWeightDevice {
+ if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
+ return err
+ }
+ if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
+ return err
+ }
+ }
+ for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
+ if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
+ return err
+ }
+ }
+ for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
+ if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
+ return err
+ }
+ }
+ for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
+ if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
+ return err
+ }
+ }
+ for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
+ if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// Remove deletes the blkio cgroup directory for d.
+func (s *BlkioGroup) Remove(d *cgroupData) error {
+ return removePath(d.path("blkio"))
+}
+
+/*
+examples:
+
+ blkio.sectors
+ 8:0 6792
+
+ blkio.io_service_bytes
+ 8:0 Read 1282048
+ 8:0 Write 2195456
+ 8:0 Sync 2195456
+ 8:0 Async 1282048
+ 8:0 Total 3477504
+ Total 3477504
+
+ blkio.io_serviced
+ 8:0 Read 124
+ 8:0 Write 104
+ 8:0 Sync 104
+ 8:0 Async 124
+ 8:0 Total 228
+ Total 228
+
+ blkio.io_queued
+ 8:0 Read 0
+ 8:0 Write 0
+ 8:0 Sync 0
+ 8:0 Async 0
+ 8:0 Total 0
+ Total 0
+*/
+
+// splitBlkioStatLine is the FieldsFunc separator for blkio stat lines,
+// splitting on both spaces and the ':' between major and minor numbers.
+func splitBlkioStatLine(r rune) bool {
+ return r == ' ' || r == ':'
+}
+
+// getBlkioStat parses a blkio stat file (see the example formats above)
+// into a list of entries. A missing file yields an empty result, not an
+// error, since individual stat files are kernel-config dependent.
+func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
+ var blkioStats []cgroups.BlkioStatEntry
+ f, err := os.Open(path)
+ if err != nil {
+ if os.IsNotExist(err) {
+ return blkioStats, nil
+ }
+ return nil, err
+ }
+ defer f.Close()
+
+ sc := bufio.NewScanner(f)
+ for sc.Scan() {
+ // format: dev type amount
+ fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
+ if len(fields) < 3 {
+ if len(fields) == 2 && fields[0] == "Total" {
+ // skip total line
+ continue
+ } else {
+ return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
+ }
+ }
+
+ v, err := strconv.ParseUint(fields[0], 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ major := v
+
+ v, err = strconv.ParseUint(fields[1], 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ minor := v
+
+ // Four fields means an operation column ("Read"/"Write"/...) is
+ // present between the device and the value.
+ op := ""
+ valueField := 2
+ if len(fields) == 4 {
+ op = fields[2]
+ valueField = 3
+ }
+ v, err = strconv.ParseUint(fields[valueField], 10, 64)
+ if err != nil {
+ return nil, err
+ }
+ blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
+ }
+
+ return blkioStats, nil
+}
+
+// GetStats fills stats from the cgroup at path, preferring the richer CFQ
+// recursive stats when available and falling back to throttle stats.
+func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
+ // Try to read CFQ stats available on all CFQ enabled kernels first
+ if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
+ return getCFQStats(path, stats)
+ }
+ return getStats(path, stats) // Use generic stats as fallback
+}
+
+// getCFQStats reads the full set of CFQ-scheduler recursive stat files
+// from the cgroup at path into stats.
+func getCFQStats(path string, stats *cgroups.Stats) error {
+ var blkioStats []cgroups.BlkioStatEntry
+ var err error
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.SectorsRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoServicedRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoQueuedRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoServiceTimeRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoWaitTimeRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoMergedRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoTimeRecursive = blkioStats
+
+ return nil
+}
+
+// getStats is the non-CFQ fallback: it reads only the throttle stat files,
+// which are available regardless of the I/O scheduler in use.
+func getStats(path string, stats *cgroups.Stats) error {
+ var blkioStats []cgroups.BlkioStatEntry
+ var err error
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+ if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
+ return err
+ }
+ stats.BlkioStats.IoServicedRecursive = blkioStats
+
+ return nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strconv"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Canned blkio stat-file contents used as test fixtures; they mirror the
+// kernel's "major:minor [op] value" line format documented in blkio.go.
+const (
+ sectorsRecursiveContents = `8:0 1024`
+ serviceBytesRecursiveContents = `8:0 Read 100
+8:0 Write 200
+8:0 Sync 300
+8:0 Async 500
+8:0 Total 500
+Total 500`
+ servicedRecursiveContents = `8:0 Read 10
+8:0 Write 40
+8:0 Sync 20
+8:0 Async 30
+8:0 Total 50
+Total 50`
+ queuedRecursiveContents = `8:0 Read 1
+8:0 Write 4
+8:0 Sync 2
+8:0 Async 3
+8:0 Total 5
+Total 5`
+ serviceTimeRecursiveContents = `8:0 Read 173959
+8:0 Write 0
+8:0 Sync 0
+8:0 Async 173959
+8:0 Total 17395
+Total 17395`
+ waitTimeRecursiveContents = `8:0 Read 15571
+8:0 Write 0
+8:0 Sync 0
+8:0 Async 15571
+8:0 Total 15571`
+ mergedRecursiveContents = `8:0 Read 5
+8:0 Write 10
+8:0 Sync 0
+8:0 Async 0
+8:0 Total 15
+Total 15`
+ timeRecursiveContents = `8:0 8`
+ throttleServiceBytes = `8:0 Read 11030528
+8:0 Write 23
+8:0 Sync 42
+8:0 Async 11030528
+8:0 Total 11030528
+252:0 Read 11030528
+252:0 Write 23
+252:0 Sync 42
+252:0 Async 11030528
+252:0 Total 11030528
+Total 22061056`
+ throttleServiced = `8:0 Read 164
+8:0 Write 23
+8:0 Sync 42
+8:0 Async 164
+8:0 Total 164
+252:0 Read 164
+252:0 Write 23
+252:0 Sync 42
+252:0 Async 164
+252:0 Total 164
+Total 328`
+)
+
+// appendBlkioStatEntry is a test helper that appends one BlkioStatEntry to
+// the slice pointed to by blkioStatEntries.
+func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) {
+ *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op})
+}
+
+// TestBlkioSetWeight checks that Set overwrites an existing blkio.weight
+// value in a fake cgroup directory.
+func TestBlkioSetWeight(t *testing.T) {
+ helper := NewCgroupTestUtil("blkio", t)
+ defer helper.cleanup()
+
+ const (
+ weightBefore = 100
+ weightAfter = 200
+ )
+
+ // Seed the fake cgroup with a pre-existing weight.
+ helper.writeFileContents(map[string]string{
+ "blkio.weight": strconv.Itoa(weightBefore),
+ })
+
+ helper.CgroupData.config.Resources.BlkioWeight = weightAfter
+ blkio := &BlkioGroup{}
+ if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "blkio.weight")
+ if err != nil {
+ t.Fatalf("Failed to parse blkio.weight - %s", err)
+ }
+
+ if value != weightAfter {
+ t.Fatal("Got the wrong value, set blkio.weight failed.")
+ }
+}
+
+// TestBlkioSetWeightDevice verifies that BlkioGroup.Set writes a configured
+// per-device weight into the blkio.weight_device cgroup file, replacing the
+// previous contents.
+func TestBlkioSetWeightDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		weightDeviceBefore = "8:0 400"
+	)
+
+	wd := configs.NewWeightDevice(8, 0, 500, 0)
+	weightDeviceAfter := wd.WeightString()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.weight_device": weightDeviceBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+	}
+
+	if value != weightDeviceAfter {
+		t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+	}
+}
+
+// TestBlkioSetMultipleWeightDevice verifies that BlkioGroup.Set performs one
+// write per configured weight device (regression test for #274).
+func TestBlkioSetMultipleWeightDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		weightDeviceBefore = "8:0 400"
+	)
+
+	wd1 := configs.NewWeightDevice(8, 0, 500, 0)
+	wd2 := configs.NewWeightDevice(8, 16, 500, 0)
+	// we cannot actually set and check both because normal ioutil.WriteFile
+	// when writing to cgroup file will overwrite the whole file content instead
+	// of updating it as the kernel is doing. Just check the second device
+	// is present will suffice for the test to ensure multiple writes are done.
+	weightDeviceAfter := wd2.WeightString()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.weight_device": weightDeviceBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+	}
+
+	if value != weightDeviceAfter {
+		t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+	}
+}
+
+// TestBlkioStats verifies that BlkioGroup.GetStats parses every CFQ
+// recursive stat file (service bytes, serviced, queued, service time,
+// wait time, merged, time, sectors) into the expected BlkioStats entries.
+func TestBlkioStats(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify expected stats.
+	expectedStats := cgroups.BlkioStats{}
+	appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "")
+
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "")
+
+	expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
+}
+
+// TestBlkioStatsNoSectorsFile verifies GetStats tolerates a missing
+// blkio.sectors_recursive file (treated as best-effort, not an error).
+func TestBlkioStatsNoSectorsFile(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServiceBytesFile verifies GetStats tolerates a missing
+// blkio.io_service_bytes_recursive file.
+func TestBlkioStatsNoServiceBytesFile(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_serviced_recursive":     servicedRecursiveContents,
+		"blkio.io_queued_recursive":       queuedRecursiveContents,
+		"blkio.sectors_recursive":         sectorsRecursiveContents,
+		"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":    waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":       mergedRecursiveContents,
+		"blkio.time_recursive":            timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServicedFile verifies GetStats tolerates a missing
+// blkio.io_serviced_recursive file.
+func TestBlkioStatsNoServicedFile(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoQueuedFile verifies GetStats tolerates a missing
+// blkio.io_queued_recursive file.
+func TestBlkioStatsNoQueuedFile(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServiceTimeFile verifies GetStats tolerates a missing
+// blkio.io_service_time_recursive file. Skipped in -short mode.
+func TestBlkioStatsNoServiceTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoWaitTimeFile verifies GetStats tolerates a missing
+// blkio.io_wait_time_recursive file. Skipped in -short mode.
+func TestBlkioStatsNoWaitTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoMergedFile verifies GetStats tolerates a missing
+// blkio.io_merged_recursive file. Skipped in -short mode.
+func TestBlkioStatsNoMergedFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoTimeFile verifies GetStats tolerates a missing
+// blkio.time_recursive file. Skipped in -short mode.
+func TestBlkioStatsNoTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsUnexpectedNumberOfFields verifies GetStats rejects a stat
+// line with four fields ("8:0 Read 100 100") instead of the expected three.
+func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": "8:0 Read 100 100",
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected to fail, but did not")
+	}
+}
+
+// TestBlkioStatsUnexpectedFieldType verifies GetStats rejects a stat line
+// whose value field is not numeric ("8:0 Read Write").
+func TestBlkioStatsUnexpectedFieldType(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": "8:0 Read Write",
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected to fail, but did not")
+	}
+}
+
+// TestNonCFQBlkioStats verifies that when the CFQ recursive stat files are
+// empty (non-CFQ scheduler), GetStats falls back to the throttle files
+// (blkio.throttle.io_service_bytes / io_serviced) to populate the stats.
+func TestNonCFQBlkioStats(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": "",
+		"blkio.io_serviced_recursive":      "",
+		"blkio.io_queued_recursive":        "",
+		"blkio.sectors_recursive":          "",
+		"blkio.io_service_time_recursive":  "",
+		"blkio.io_wait_time_recursive":     "",
+		"blkio.io_merged_recursive":        "",
+		"blkio.time_recursive":             "",
+		"blkio.throttle.io_service_bytes":  throttleServiceBytes,
+		"blkio.throttle.io_serviced":       throttleServiced,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify expected stats.
+	expectedStats := cgroups.BlkioStats{}
+
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total")
+
+	expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
+}
+
+// TestBlkioSetThrottleReadBpsDevice verifies Set writes the configured
+// per-device read-bps limit into blkio.throttle.read_bps_device.
+func TestBlkioSetThrottleReadBpsDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		throttleBefore = `8:0 1024`
+	)
+
+	td := configs.NewThrottleDevice(8, 0, 2048)
+	throttleAfter := td.String()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.throttle.read_bps_device": throttleBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err)
+	}
+
+	if value != throttleAfter {
+		t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.")
+	}
+}
+
+// TestBlkioSetThrottleWriteBpsDevice verifies Set writes the configured
+// per-device write-bps limit into blkio.throttle.write_bps_device.
+func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		throttleBefore = `8:0 1024`
+	)
+
+	td := configs.NewThrottleDevice(8, 0, 2048)
+	throttleAfter := td.String()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.throttle.write_bps_device": throttleBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err)
+	}
+
+	if value != throttleAfter {
+		t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.")
+	}
+}
+
+// TestBlkioSetThrottleReadIOpsDevice verifies Set writes the configured
+// per-device read-iops limit into blkio.throttle.read_iops_device.
+func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		throttleBefore = `8:0 1024`
+	)
+
+	td := configs.NewThrottleDevice(8, 0, 2048)
+	throttleAfter := td.String()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.throttle.read_iops_device": throttleBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{td}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err)
+	}
+
+	if value != throttleAfter {
+		t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.")
+	}
+}
+
+// TestBlkioSetThrottleWriteIOpsDevice verifies Set writes the configured
+// per-device write-iops limit into blkio.throttle.write_iops_device.
+func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+
+	const (
+		throttleBefore = `8:0 1024`
+	)
+
+	td := configs.NewThrottleDevice(8, 0, 2048)
+	throttleAfter := td.String()
+
+	helper.writeFileContents(map[string]string{
+		"blkio.throttle.write_iops_device": throttleBefore,
+	})
+
+	helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td}
+	blkio := &BlkioGroup{}
+	if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err)
+	}
+
+	if value != throttleAfter {
+		t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.")
+	}
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "bufio"
+ "os"
+ "path/filepath"
+ "strconv"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// CpuGroup implements the cgroup v1 "cpu" subsystem (CFS shares/quota and
+// real-time scheduling bandwidth).
+type CpuGroup struct {
+}
+
+// Name returns the subsystem name, "cpu".
+func (s *CpuGroup) Name() string {
+	return "cpu"
+}
+
+// Apply resolves the container's cpu cgroup path and joins the process to
+// it via ApplyDir. A not-found path error is ignored (no cpu cgroup mounted).
+func (s *CpuGroup) Apply(d *cgroupData) error {
+	// We always want to join the cpu group, to allow fair cpu scheduling
+	// on a container basis
+	path, err := d.path("cpu")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return s.ApplyDir(path, d.config, d.pid)
+}
+
+// ApplyDir creates the cpu cgroup directory at path, applies the RT
+// scheduling settings, and then moves pid into the cgroup's procs file.
+// An empty path is a no-op (no cpu cgroup mounted).
+func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
+	// This might happen if we have no cpu cgroup mounted.
+	// Just do nothing and don't fail.
+	if path == "" {
+		return nil
+	}
+	if err := os.MkdirAll(path, 0755); err != nil {
+		return err
+	}
+	// We should set the real-Time group scheduling settings before moving
+	// in the process because if the process is already in SCHED_RR mode
+	// and no RT bandwidth is set, adding it will fail.
+	if err := s.SetRtSched(path, cgroup); err != nil {
+		return err
+	}
+	// because we are not using d.join we need to place the pid into the procs file
+	// unlike the other subsystems
+	return cgroups.WriteCgroupProc(path, pid)
+}
+
+// SetRtSched writes the real-time scheduler bandwidth settings
+// (cpu.rt_period_us and cpu.rt_runtime_us) when they are non-zero.
+// A zero value means "leave the kernel default untouched".
+func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.CpuRtPeriod != 0 {
+		if err := writeFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
+			return err
+		}
+	}
+	if cgroup.Resources.CpuRtRuntime != 0 {
+		if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Set writes the configured CFS settings (cpu.shares, cpu.cfs_period_us,
+// cpu.cfs_quota_us) when non-zero, then delegates RT settings to SetRtSched.
+func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.CpuShares != 0 {
+		if err := writeFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
+			return err
+		}
+	}
+	if cgroup.Resources.CpuPeriod != 0 {
+		if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
+			return err
+		}
+	}
+	if cgroup.Resources.CpuQuota != 0 {
+		if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
+			return err
+		}
+	}
+	return s.SetRtSched(path, cgroup)
+}
+
+// Remove deletes the container's cpu cgroup directory.
+func (s *CpuGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("cpu"))
+}
+
+// GetStats parses cpu.stat into stats.CpuStats.ThrottlingData. A missing
+// cpu.stat file is not an error (older kernels / no CFS bandwidth).
+// Expected line format: "<key> <value>", e.g. "nr_periods 200".
+func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
+	f, err := os.Open(filepath.Join(path, "cpu.stat"))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return err
+	}
+	defer f.Close()
+
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		t, v, err := getCgroupParamKeyValue(sc.Text())
+		if err != nil {
+			return err
+		}
+		switch t {
+		case "nr_periods":
+			stats.CpuStats.ThrottlingData.Periods = v
+
+		case "nr_throttled":
+			stats.CpuStats.ThrottlingData.ThrottledPeriods = v
+
+		case "throttled_time":
+			stats.CpuStats.ThrottlingData.ThrottledTime = v
+		}
+	}
+	return nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "strconv"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
+// TestCpuSetShares verifies that CpuGroup.Set writes the configured
+// CpuShares value into the cpu.shares cgroup file.
+func TestCpuSetShares(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+
+	const (
+		sharesBefore = 1024
+		sharesAfter  = 512
+	)
+
+	helper.writeFileContents(map[string]string{
+		"cpu.shares": strconv.Itoa(sharesBefore),
+	})
+
+	helper.CgroupData.config.Resources.CpuShares = sharesAfter
+	cpu := &CpuGroup{}
+	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := getCgroupParamUint(helper.CgroupPath, "cpu.shares")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.shares - %s", err)
+	}
+
+	if value != sharesAfter {
+		t.Fatal("Got the wrong value, set cpu.shares failed.")
+	}
+}
+
+// TestCpuSetBandWidth verifies that CpuGroup.Set writes all four bandwidth
+// settings: cfs_quota_us, cfs_period_us, rt_runtime_us and rt_period_us.
+func TestCpuSetBandWidth(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+
+	const (
+		quotaBefore     = 8000
+		quotaAfter      = 5000
+		periodBefore    = 10000
+		periodAfter     = 7000
+		rtRuntimeBefore = 8000
+		rtRuntimeAfter  = 5000
+		rtPeriodBefore  = 10000
+		rtPeriodAfter   = 7000
+	)
+
+	helper.writeFileContents(map[string]string{
+		"cpu.cfs_quota_us":  strconv.Itoa(quotaBefore),
+		"cpu.cfs_period_us": strconv.Itoa(periodBefore),
+		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
+		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
+	})
+
+	helper.CgroupData.config.Resources.CpuQuota = quotaAfter
+	helper.CgroupData.config.Resources.CpuPeriod = periodAfter
+	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
+	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+	cpu := &CpuGroup{}
+	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	quota, err := getCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
+	}
+	if quota != quotaAfter {
+		t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
+	}
+
+	period, err := getCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
+	}
+	if period != periodAfter {
+		t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
+	}
+	rtRuntime, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+	}
+	if rtRuntime != rtRuntimeAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
+	}
+	rtPeriod, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+	}
+	if rtPeriod != rtPeriodAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
+	}
+}
+
+// TestCpuStats verifies GetStats parses cpu.stat into ThrottlingData,
+// including a throttled_time at the maximum uint64 value.
+func TestCpuStats(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+
+	const (
+		nrPeriods     = 2000
+		nrThrottled   = 200
+		throttledTime = uint64(18446744073709551615)
+	)
+
+	cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
+		nrPeriods, nrThrottled, throttledTime)
+	helper.writeFileContents(map[string]string{
+		"cpu.stat": cpuStatContent,
+	})
+
+	cpu := &CpuGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedStats := cgroups.ThrottlingData{
+		Periods:          nrPeriods,
+		ThrottledPeriods: nrThrottled,
+		ThrottledTime:    throttledTime}
+
+	expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
+}
+
+// TestNoCpuStatFile verifies GetStats returns nil when cpu.stat is absent.
+func TestNoCpuStatFile(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+
+	cpu := &CpuGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatal("Expected not to fail, but did")
+	}
+}
+
+// TestInvalidCpuStat verifies GetStats returns an error when a cpu.stat
+// value is not numeric.
+func TestInvalidCpuStat(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+	cpuStatContent := `nr_periods 2000
+	nr_throttled 200
+	throttled_time fortytwo`
+	helper.writeFileContents(map[string]string{
+		"cpu.stat": cpuStatContent,
+	})
+
+	cpu := &CpuGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected failed stat parsing.")
+	}
+}
+
+// TestCpuSetRtSchedAtApply verifies ApplyDir sets the RT scheduling files
+// (rt_runtime_us, rt_period_us) before placing the pid in cgroup.procs,
+// and that the pid itself is written.
+func TestCpuSetRtSchedAtApply(t *testing.T) {
+	helper := NewCgroupTestUtil("cpu", t)
+	defer helper.cleanup()
+
+	const (
+		rtRuntimeBefore = 0
+		rtRuntimeAfter  = 5000
+		rtPeriodBefore  = 0
+		rtPeriodAfter   = 7000
+	)
+
+	helper.writeFileContents(map[string]string{
+		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
+		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
+	})
+
+	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
+	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+	cpu := &CpuGroup{}
+	// Arbitrary pid; the test cgroup is a plain directory, so the write
+	// simply lands in the fake cgroup.procs file.
+	if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil {
+		t.Fatal(err)
+	}
+
+	rtRuntime, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+	}
+	if rtRuntime != rtRuntimeAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
+	}
+	rtPeriod, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+	}
+	if rtPeriod != rtPeriodAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
+	}
+	pid, err := getCgroupParamUint(helper.CgroupPath, "cgroup.procs")
+	if err != nil {
+		t.Fatalf("Failed to parse cgroup.procs - %s", err)
+	}
+	if pid != 1234 {
+		t.Fatal("Got the wrong value, set cgroup.procs failed.")
+	}
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "io/ioutil"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/system"
+)
+
+const (
+ cgroupCpuacctStat = "cpuacct.stat"
+ nanosecondsInSecond = 1000000000
+)
+
+var clockTicks = uint64(system.GetClockTicks())
+
+// CpuacctGroup implements the cgroup v1 "cpuacct" (CPU accounting)
+// subsystem. It is read-only: it reports usage but sets no limits.
+type CpuacctGroup struct {
+}
+
+// Name returns the subsystem name, "cpuacct".
+func (s *CpuacctGroup) Name() string {
+	return "cpuacct"
+}
+
+// Apply joins the process to the cpuacct cgroup so usage is accounted;
+// a not-found error (subsystem not mounted) is ignored.
+func (s *CpuacctGroup) Apply(d *cgroupData) error {
+	// we just want to join this group even though we don't set anything
+	if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+
+	return nil
+}
+
+// Set is a no-op: cpuacct has no configurable resources.
+func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
+	return nil
+}
+
+// Remove deletes the container's cpuacct cgroup directory.
+func (s *CpuacctGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("cpuacct"))
+}
+
+// GetStats fills stats.CpuStats.CpuUsage from cpuacct files: total usage
+// (cpuacct.usage, nanoseconds), per-CPU usage (cpuacct.usage_percpu), and
+// the user/kernel breakdown derived from cpuacct.stat.
+func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
+	userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
+	if err != nil {
+		return err
+	}
+
+	totalUsage, err := getCgroupParamUint(path, "cpuacct.usage")
+	if err != nil {
+		return err
+	}
+
+	percpuUsage, err := getPercpuUsage(path)
+	if err != nil {
+		return err
+	}
+
+	stats.CpuStats.CpuUsage.TotalUsage = totalUsage
+	stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
+	stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
+	stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
+	return nil
+}
+
+// Returns user and kernel usage breakdown in nanoseconds.
+// The raw cpuacct.stat values are in clock ticks and are converted via
+// ticks * 1e9 / clockTicks.
+// NOTE(review): the multiplication happens before the division, so usage
+// above ~2^64/1e9 ticks would overflow uint64 — presumably out of range in
+// practice, but worth confirming for very long-lived hosts.
+func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
+	userModeUsage := uint64(0)
+	kernelModeUsage := uint64(0)
+	const (
+		userField   = "user"
+		systemField = "system"
+	)
+
+	// Expected format:
+	// user <usage in ticks>
+	// system <usage in ticks>
+	data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
+	if err != nil {
+		return 0, 0, err
+	}
+	fields := strings.Fields(string(data))
+	if len(fields) != 4 {
+		return 0, 0, fmt.Errorf("failure - %s is expected to have 4 fields", filepath.Join(path, cgroupCpuacctStat))
+	}
+	if fields[0] != userField {
+		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
+	}
+	if fields[2] != systemField {
+		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
+	}
+	if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
+		return 0, 0, err
+	}
+	if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
+		return 0, 0, err
+	}
+
+	return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
+}
+
+// getPercpuUsage reads cpuacct.usage_percpu and returns one cumulative
+// usage value per CPU, in file order. The file is a single line of
+// whitespace-separated unsigned integers.
+func getPercpuUsage(path string) ([]uint64, error) {
+	percpuUsage := []uint64{}
+	data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
+	if err != nil {
+		return percpuUsage, err
+	}
+	for _, value := range strings.Fields(string(data)) {
+		value, err := strconv.ParseUint(value, 10, 64)
+		if err != nil {
+			// Go convention: error strings start lowercase (staticcheck ST1005).
+			return percpuUsage, fmt.Errorf("unable to convert param value to uint64: %s", err)
+		}
+		percpuUsage = append(percpuUsage, value)
+	}
+	return percpuUsage, nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+)
+
// CpusetGroup implements the cpuset cgroup-v1 subsystem.
type CpusetGroup struct {
}

// Name returns the subsystem name used in cgroup paths.
func (s *CpusetGroup) Name() string {
	return "cpuset"
}
+
// Apply creates the cpuset cgroup for d (initializing cpus/mems from the
// parent hierarchy as needed) and moves the process into it.
func (s *CpusetGroup) Apply(d *cgroupData) error {
	dir, err := d.path("cpuset")
	// "not found" is tolerated; ApplyDir treats an empty dir as
	// "cpuset not mounted" and does nothing.
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}
	return s.ApplyDir(dir, d.config, d.pid)
}
+
+func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if cgroup.Resources.CpusetCpus != "" {
+ if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
+ return err
+ }
+ }
+ if cgroup.Resources.CpusetMems != "" {
+ if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
// Remove deletes the cpuset cgroup directory for d.
func (s *CpusetGroup) Remove(d *cgroupData) error {
	return removePath(d.path("cpuset"))
}

// GetStats is a no-op: the cpuset subsystem exposes no statistics.
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
+
// ApplyDir creates the cpuset cgroup directory dir (creating and
// populating any missing ancestors below the cpuset mount root) and then
// moves pid into it via cgroup.procs.
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
	// This might happen if we have no cpuset cgroup mounted.
	// Just do nothing and don't fail.
	if dir == "" {
		return nil
	}
	mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
	if err != nil {
		return err
	}
	// root is the parent of the closest mountpoint ancestor of dir —
	// directories at or above it are never modified by ensureParent.
	root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
	// 'ensureParent' start with parent because we don't want to
	// explicitly inherit from parent, it could conflict with
	// 'cpuset.cpu_exclusive'.
	if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
		return err
	}
	if err := os.MkdirAll(dir, 0755); err != nil {
		return err
	}
	// We didn't inherit cpuset configs from parent, but we have
	// to ensure cpuset configs are set before moving task into the
	// cgroup.
	// The logic is, if user specified cpuset configs, use these
	// specified configs, otherwise, inherit from parent. This makes
	// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
	// keep backward compatibility.
	if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
		return err
	}

	// because we are not using d.join we need to place the pid into the procs file
	// unlike the other subsystems
	return cgroups.WriteCgroupProc(dir, pid)
}
+
+func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
+ if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
+ return
+ }
+ if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
+ return
+ }
+ return cpus, mems, nil
+}
+
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// its parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
	parent := filepath.Dir(current)
	// Stop once we reach the cgroup root: it must not be modified.
	if libcontainerUtils.CleanPath(parent) == root {
		return nil
	}
	// Avoid infinite recursion.
	if parent == current {
		return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
	}
	// Recurse upwards first, so ancestors are created and populated
	// top-down before current is.
	if err := s.ensureParent(parent, root); err != nil {
		return err
	}
	if err := os.MkdirAll(current, 0755); err != nil {
		return err
	}
	return s.copyIfNeeded(current, parent)
}
+
+// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
+// directory to the current directory if the file's contents are 0
+func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
+ var (
+ err error
+ currentCpus, currentMems []byte
+ parentCpus, parentMems []byte
+ )
+
+ if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
+ return err
+ }
+ if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
+ return err
+ }
+
+ if s.isEmpty(currentCpus) {
+ if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
+ return err
+ }
+ }
+ if s.isEmpty(currentMems) {
+ if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (s *CpusetGroup) isEmpty(b []byte) bool {
+ return len(bytes.Trim(b, "\n")) == 0
+}
+
// ensureCpusAndMems writes any user-specified cpus/mems settings and then
// inherits whatever is still empty from the parent directory.
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
	if err := s.Set(path, cgroup); err != nil {
		return err
	}
	return s.copyIfNeeded(path, filepath.Dir(path))
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "testing"
+)
+
// TestCpusetSetCpus verifies that Set overwrites cpuset.cpus with the
// configured value.
func TestCpusetSetCpus(t *testing.T) {
	helper := NewCgroupTestUtil("cpuset", t)
	defer helper.cleanup()

	const (
		cpusBefore = "0"
		cpusAfter  = "1-3"
	)

	helper.writeFileContents(map[string]string{
		"cpuset.cpus": cpusBefore,
	})

	helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
	cpuset := &CpusetGroup{}
	if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamString(helper.CgroupPath, "cpuset.cpus")
	if err != nil {
		t.Fatalf("Failed to parse cpuset.cpus - %s", err)
	}

	if value != cpusAfter {
		t.Fatal("Got the wrong value, set cpuset.cpus failed.")
	}
}
+
// TestCpusetSetMems verifies that Set overwrites cpuset.mems with the
// configured value.
func TestCpusetSetMems(t *testing.T) {
	helper := NewCgroupTestUtil("cpuset", t)
	defer helper.cleanup()

	const (
		memsBefore = "0"
		memsAfter  = "1"
	)

	helper.writeFileContents(map[string]string{
		"cpuset.mems": memsBefore,
	})

	helper.CgroupData.config.Resources.CpusetMems = memsAfter
	cpuset := &CpusetGroup{}
	if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamString(helper.CgroupPath, "cpuset.mems")
	if err != nil {
		t.Fatalf("Failed to parse cpuset.mems - %s", err)
	}

	if value != memsAfter {
		t.Fatal("Got the wrong value, set cpuset.mems failed.")
	}
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/system"
+)
+
// DevicesGroup implements the devices cgroup-v1 subsystem, which controls
// device-node access for the container.
type DevicesGroup struct {
}

// Name returns the subsystem name used in cgroup paths.
func (s *DevicesGroup) Name() string {
	return "devices"
}
+
+func (s *DevicesGroup) Apply(d *cgroupData) error {
+ _, err := d.join("devices")
+ if err != nil {
+ // We will return error even it's `not found` error, devices
+ // cgroup is hard requirement for container's security.
+ return err
+ }
+ return nil
+}
+
+func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if system.RunningInUserNS() {
+ return nil
+ }
+
+ devices := cgroup.Resources.Devices
+ if len(devices) > 0 {
+ for _, dev := range devices {
+ file := "devices.deny"
+ if dev.Allow {
+ file = "devices.allow"
+ }
+ if err := writeFile(path, file, dev.CgroupString()); err != nil {
+ return err
+ }
+ }
+ return nil
+ }
+ if cgroup.Resources.AllowAllDevices != nil {
+ if *cgroup.Resources.AllowAllDevices == false {
+ if err := writeFile(path, "devices.deny", "a"); err != nil {
+ return err
+ }
+
+ for _, dev := range cgroup.Resources.AllowedDevices {
+ if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
+ return err
+ }
+ }
+ return nil
+ }
+
+ if err := writeFile(path, "devices.allow", "a"); err != nil {
+ return err
+ }
+ }
+
+ for _, dev := range cgroup.Resources.DeniedDevices {
+ if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Remove deletes the devices cgroup directory for d.
func (s *DevicesGroup) Remove(d *cgroupData) error {
	return removePath(d.path("devices"))
}

// GetStats is a no-op: the devices subsystem exposes no statistics.
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Test fixtures: a whitelisted device (/dev/zero) and a blacklisted
// device (/dev/null), with the cgroup-file strings they should produce.
var (
	allowedDevices = []*configs.Device{
		{
			Path:        "/dev/zero",
			Type:        'c',
			Major:       1,
			Minor:       5,
			Permissions: "rwm",
			FileMode:    0666,
		},
	}
	allowedList   = "c 1:5 rwm"
	deniedDevices = []*configs.Device{
		{
			Path:        "/dev/null",
			Type:        'c',
			Major:       1,
			Minor:       3,
			Permissions: "rwm",
			FileMode:    0666,
		},
	}
	deniedList = "c 1:3 rwm"
)
+
// TestDevicesSetAllow verifies that with AllowAllDevices == false the
// allowed devices are written to devices.allow, and that a nil
// AllowAllDevices leaves the existing policy untouched.
func TestDevicesSetAllow(t *testing.T) {
	helper := NewCgroupTestUtil("devices", t)
	defer helper.cleanup()

	helper.writeFileContents(map[string]string{
		"devices.deny": "a",
	})
	allowAllDevices := false
	helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
	helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
	devices := &DevicesGroup{}
	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamString(helper.CgroupPath, "devices.allow")
	if err != nil {
		t.Fatalf("Failed to parse devices.allow - %s", err)
	}

	if value != allowedList {
		t.Fatal("Got the wrong value, set devices.allow failed.")
	}

	// When AllowAllDevices is nil, devices.allow file should not be modified.
	helper.CgroupData.config.Resources.AllowAllDevices = nil
	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}
	value, err = getCgroupParamString(helper.CgroupPath, "devices.allow")
	if err != nil {
		t.Fatalf("Failed to parse devices.allow - %s", err)
	}
	if value != allowedList {
		t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
	}
}
+
// TestDevicesSetDeny verifies that with AllowAllDevices == true the
// denied devices are written to devices.deny.
func TestDevicesSetDeny(t *testing.T) {
	helper := NewCgroupTestUtil("devices", t)
	defer helper.cleanup()

	helper.writeFileContents(map[string]string{
		"devices.allow": "a",
	})

	allowAllDevices := true
	helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
	helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
	devices := &DevicesGroup{}
	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamString(helper.CgroupPath, "devices.deny")
	if err != nil {
		t.Fatalf("Failed to parse devices.deny - %s", err)
	}

	if value != deniedList {
		t.Fatal("Got the wrong value, set devices.deny failed.")
	}
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// FreezerGroup implements the freezer cgroup-v1 subsystem, used to pause
// and resume all tasks in a container.
type FreezerGroup struct {
}

// Name returns the subsystem name used in cgroup paths.
func (s *FreezerGroup) Name() string {
	return "freezer"
}
+
+func (s *FreezerGroup) Apply(d *cgroupData) error {
+ _, err := d.join("freezer")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
+ switch cgroup.Resources.Freezer {
+ case configs.Frozen, configs.Thawed:
+ for {
+ // In case this loop does not exit because it doesn't get the expected
+ // state, let's write again this state, hoping it's going to be properly
+ // set this time. Otherwise, this loop could run infinitely, waiting for
+ // a state change that would never happen.
+ if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
+ return err
+ }
+
+ state, err := readFile(path, "freezer.state")
+ if err != nil {
+ return err
+ }
+ if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
+ break
+ }
+
+ time.Sleep(1 * time.Millisecond)
+ }
+ case configs.Undefined:
+ return nil
+ default:
+ return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
+ }
+
+ return nil
+}
+
// Remove deletes the freezer cgroup directory for d.
func (s *FreezerGroup) Remove(d *cgroupData) error {
	return removePath(d.path("freezer"))
}

// GetStats is a no-op: the freezer subsystem exposes no statistics.
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// TestFreezerSetState verifies that Set writes the requested state to
// freezer.state (Frozen -> Thawed here).
func TestFreezerSetState(t *testing.T) {
	helper := NewCgroupTestUtil("freezer", t)
	defer helper.cleanup()

	helper.writeFileContents(map[string]string{
		"freezer.state": string(configs.Frozen),
	})

	helper.CgroupData.config.Resources.Freezer = configs.Thawed
	freezer := &FreezerGroup{}
	if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamString(helper.CgroupPath, "freezer.state")
	if err != nil {
		t.Fatalf("Failed to parse freezer.state - %s", err)
	}
	if value != string(configs.Thawed) {
		t.Fatal("Got the wrong value, set freezer.state failed.")
	}
}
+
// TestFreezerSetInvalidState verifies that Set rejects a state that is
// neither Frozen, Thawed, nor Undefined.
func TestFreezerSetInvalidState(t *testing.T) {
	helper := NewCgroupTestUtil("freezer", t)
	defer helper.cleanup()

	const (
		invalidArg configs.FreezerState = "Invalid"
	)

	helper.CgroupData.config.Resources.Freezer = invalidArg
	freezer := &FreezerGroup{}
	if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil {
		t.Fatal("Failed to return invalid argument error")
	}
}
--- /dev/null
+// +build !linux
+
+package fs
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// HugetlbGroup implements the hugetlb cgroup-v1 subsystem, which limits
// huge-page usage per page size.
type HugetlbGroup struct {
}

// Name returns the subsystem name used in cgroup paths.
func (s *HugetlbGroup) Name() string {
	return "hugetlb"
}
+
+func (s *HugetlbGroup) Apply(d *cgroupData) error {
+ _, err := d.join("hugetlb")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
+ for _, hugetlb := range cgroup.Resources.HugetlbLimit {
+ if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Remove deletes the hugetlb cgroup directory for d.
func (s *HugetlbGroup) Remove(d *cgroupData) error {
	return removePath(d.path("hugetlb"))
}
+
+func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
+ hugetlbStats := cgroups.HugetlbStats{}
+ for _, pageSize := range HugePageSizes {
+ usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
+ value, err := getCgroupParamUint(path, usage)
+ if err != nil {
+ return fmt.Errorf("failed to parse %s - %v", usage, err)
+ }
+ hugetlbStats.Usage = value
+
+ maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
+ value, err = getCgroupParamUint(path, maxUsage)
+ if err != nil {
+ return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
+ }
+ hugetlbStats.MaxUsage = value
+
+ failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
+ value, err = getCgroupParamUint(path, failcnt)
+ if err != nil {
+ return fmt.Errorf("failed to parse %s - %v", failcnt, err)
+ }
+ hugetlbStats.Failcnt = value
+
+ stats.HugetlbStats[pageSize] = hugetlbStats
+ }
+
+ return nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "strconv"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Canned file contents used as hugetlb counter fixtures.
const (
	hugetlbUsageContents    = "128\n"
	hugetlbMaxUsageContents = "256\n"
	hugetlbFailcnt          = "100\n"
)

// Format strings for the per-pagesize hugetlb control files; %s is the
// page size (e.g. "2MB").
var (
	usage    = "hugetlb.%s.usage_in_bytes"
	limit    = "hugetlb.%s.limit_in_bytes"
	maxUsage = "hugetlb.%s.max_usage_in_bytes"
	failcnt  = "hugetlb.%s.failcnt"
)
+
// TestHugetlbSetHugetlb verifies that Set updates the per-pagesize
// limit_in_bytes file for every supported page size.
func TestHugetlbSetHugetlb(t *testing.T) {
	helper := NewCgroupTestUtil("hugetlb", t)
	defer helper.cleanup()

	const (
		hugetlbBefore = 256
		hugetlbAfter  = 512
	)

	for _, pageSize := range HugePageSizes {
		helper.writeFileContents(map[string]string{
			fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore),
		})
	}

	for _, pageSize := range HugePageSizes {
		helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{
			{
				Pagesize: pageSize,
				Limit:    hugetlbAfter,
			},
		}
		hugetlb := &HugetlbGroup{}
		if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
			t.Fatal(err)
		}
	}

	for _, pageSize := range HugePageSizes {
		limit := fmt.Sprintf(limit, pageSize)
		value, err := getCgroupParamUint(helper.CgroupPath, limit)
		if err != nil {
			t.Fatalf("Failed to parse %s - %s", limit, err)
		}
		if value != hugetlbAfter {
			t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
		}
	}
}
+
// TestHugetlbStats verifies that GetStats parses usage, max usage, and
// failcnt for every supported page size.
func TestHugetlbStats(t *testing.T) {
	helper := NewCgroupTestUtil("hugetlb", t)
	defer helper.cleanup()
	for _, pageSize := range HugePageSizes {
		helper.writeFileContents(map[string]string{
			fmt.Sprintf(usage, pageSize):    hugetlbUsageContents,
			fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
			fmt.Sprintf(failcnt, pageSize):  hugetlbFailcnt,
		})
	}

	hugetlb := &HugetlbGroup{}
	actualStats := *cgroups.NewStats()
	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
	if err != nil {
		t.Fatal(err)
	}
	expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
	for _, pageSize := range HugePageSizes {
		expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
	}
}
+
+func TestHugetlbStatsNoUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("hugetlb", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ maxUsage: hugetlbMaxUsageContents,
+ })
+
+ hugetlb := &HugetlbGroup{}
+ actualStats := *cgroups.NewStats()
+ err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
// TestHugetlbStatsNoMaxUsageFile verifies that GetStats fails when the
// max_usage_in_bytes file is missing (only usage files are present).
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
	helper := NewCgroupTestUtil("hugetlb", t)
	defer helper.cleanup()
	for _, pageSize := range HugePageSizes {
		helper.writeFileContents(map[string]string{
			fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
		})
	}

	hugetlb := &HugetlbGroup{}
	actualStats := *cgroups.NewStats()
	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
	if err == nil {
		t.Fatal("Expected failure")
	}
}
+
+func TestHugetlbStatsBadUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("hugetlb", t)
+ defer helper.cleanup()
+ for _, pageSize := range HugePageSizes {
+ helper.writeFileContents(map[string]string{
+ fmt.Sprintf(usage, pageSize): "bad",
+ maxUsage: hugetlbMaxUsageContents,
+ })
+ }
+
+ hugetlb := &HugetlbGroup{}
+ actualStats := *cgroups.NewStats()
+ err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("hugetlb", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ usage: hugetlbUsageContents,
+ maxUsage: "bad",
+ })
+
+ hugetlb := &HugetlbGroup{}
+ actualStats := *cgroups.NewStats()
+ err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
--- /dev/null
+// +build linux,!nokmem
+
+package fs
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strconv"
+ "syscall" // for Errno type only
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "golang.org/x/sys/unix"
+)
+
// cgroupKernelMemoryLimit is the cgroup file controlling the kernel memory limit.
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
+
+func EnableKernelMemoryAccounting(path string) error {
+ // Check if kernel memory is enabled
+ // We have to limit the kernel memory here as it won't be accounted at all
+ // until a limit is set on the cgroup and limit cannot be set once the
+ // cgroup has children, or if there are already tasks in the cgroup.
+ for _, i := range []int64{1, -1} {
+ if err := setKernelMemory(path, i); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
// setKernelMemory writes kernelMemoryLimit (bytes; -1 means unlimited) to
// memory.kmem.limit_in_bytes under path. It is a no-op when the kernel
// lacks kmem support, and converts an EBUSY write error into a
// descriptive message.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
	if path == "" {
		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
	}
	if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
		// kernel memory is not enabled on the system so we should do nothing
		return nil
	}
	if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
		// Check if the error number returned by the syscall is "EBUSY"
		// The EBUSY signal is returned on attempts to write to the
		// memory.kmem.limit_in_bytes file if the cgroup has children or
		// once tasks have been attached to the cgroup
		if pathErr, ok := err.(*os.PathError); ok {
			if errNo, ok := pathErr.Err.(syscall.Errno); ok {
				if errNo == unix.EBUSY {
					return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
				}
			}
		}
		return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
	}
	return nil
}
--- /dev/null
+// +build linux,nokmem
+
+package fs
+
// EnableKernelMemoryAccounting is a no-op under the nokmem build tag.
func EnableKernelMemoryAccounting(path string) error {
	return nil
}

// setKernelMemory is a no-op under the nokmem build tag.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Names of the cgroup files holding the memory+swap and memory limits.
const (
	cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
	cgroupMemoryLimit     = "memory.limit_in_bytes"
)

// MemoryGroup implements the memory cgroup-v1 subsystem.
type MemoryGroup struct {
}

// Name returns the subsystem name used in cgroup paths.
func (s *MemoryGroup) Name() string {
	return "memory"
}
+
// Apply creates the memory cgroup when any memory setting is configured —
// enabling kernel memory accounting while the cgroup is still empty —
// and then joins it. The named return err lets the deferred cleanup
// remove the directory again if joining fails.
func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
	path, err := d.path("memory")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	} else if path == "" {
		return nil
	}
	if memoryAssigned(d.config) {
		if _, err := os.Stat(path); os.IsNotExist(err) {
			if err := os.MkdirAll(path, 0755); err != nil {
				return err
			}
			// Only enable kernel memory accounting when this cgroup
			// is created by libcontainer, otherwise we might get
			// error when people use `cgroupsPath` to join an existed
			// cgroup whose kernel memory is not initialized.
			if err := EnableKernelMemoryAccounting(path); err != nil {
				return err
			}
		}
	}
	defer func() {
		// Roll back the directory if the join below fails.
		if err != nil {
			os.RemoveAll(path)
		}
	}()

	// We need to join memory cgroup after set memory limits, because
	// kmem.limit_in_bytes can only be set when the cgroup is empty.
	_, err = d.join("memory")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}
	return nil
}
+
// setMemoryAndSwap writes the memory and memory+swap limits in an order
// that satisfies the kernel's invariant memsw >= mem: when raising the
// limits, swap is written first; when lowering them, memory is written
// first.
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
	// If the memory update is set to -1 we should also
	// set swap to -1, it means unlimited memory.
	if cgroup.Resources.Memory == -1 {
		// Only set swap if it's enabled in kernel
		if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
			cgroup.Resources.MemorySwap = -1
		}
	}

	// When memory and swap memory are both set, we need to handle the cases
	// for updating container.
	if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
		memoryUsage, err := getMemoryData(path, "")
		if err != nil {
			return err
		}

		// When update memory limit, we should adapt the write sequence
		// for memory and swap memory, so it won't fail because the new
		// value and the old value don't fit kernel's validation.
		if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
			if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
				return err
			}
			if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
				return err
			}
		} else {
			if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
				return err
			}
			if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
				return err
			}
		}
	} else {
		// Only one of the two limits is set; write it alone.
		if cgroup.Resources.Memory != 0 {
			if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
				return err
			}
		}
		if cgroup.Resources.MemorySwap != 0 {
			if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
				return err
			}
		}
	}

	return nil
}
+
+func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if err := setMemoryAndSwap(path, cgroup); err != nil {
+ return err
+ }
+
+ if cgroup.Resources.KernelMemory != 0 {
+ if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
+ return err
+ }
+ }
+
+ if cgroup.Resources.MemoryReservation != 0 {
+ if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+ return err
+ }
+ }
+
+ if cgroup.Resources.KernelMemoryTCP != 0 {
+ if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
+ return err
+ }
+ }
+ if cgroup.Resources.OomKillDisable {
+ if err := writeFile(path, "memory.oom_control", "1"); err != nil {
+ return err
+ }
+ }
+ if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
+ return nil
+ } else if *cgroup.Resources.MemorySwappiness <= 100 {
+ if err := writeFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
+ return err
+ }
+ } else {
+ return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
+ }
+
+ return nil
+}
+
// Remove deletes the memory cgroup directory for d.
func (s *MemoryGroup) Remove(d *cgroupData) error {
	return removePath(d.path("memory"))
}
+
// GetStats fills stats.MemoryStats from memory.stat plus the usage
// counters for memory, swap, kernel memory, and kernel TCP memory, and
// records whether hierarchical accounting is enabled.
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
	// Set stats from memory.stat.
	statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
	if err != nil {
		// A missing memory.stat means the memory cgroup is absent;
		// report no stats rather than an error.
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	defer statsFile.Close()

	// memory.stat is a sequence of "<key> <value>" lines.
	sc := bufio.NewScanner(statsFile)
	for sc.Scan() {
		t, v, err := getCgroupParamKeyValue(sc.Text())
		if err != nil {
			return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
		}
		stats.MemoryStats.Stats[t] = v
	}
	stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]

	memoryUsage, err := getMemoryData(path, "")
	if err != nil {
		return err
	}
	stats.MemoryStats.Usage = memoryUsage
	swapUsage, err := getMemoryData(path, "memsw")
	if err != nil {
		return err
	}
	stats.MemoryStats.SwapUsage = swapUsage
	kernelUsage, err := getMemoryData(path, "kmem")
	if err != nil {
		return err
	}
	stats.MemoryStats.KernelUsage = kernelUsage
	kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
	if err != nil {
		return err
	}
	stats.MemoryStats.KernelTCPUsage = kernelTCPUsage

	useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
	value, err := getCgroupParamUint(path, useHierarchy)
	if err != nil {
		return err
	}
	if value == 1 {
		stats.MemoryStats.UseHierarchy = true
	}
	return nil
}
+
+func memoryAssigned(cgroup *configs.Cgroup) bool {
+ return cgroup.Resources.Memory != 0 ||
+ cgroup.Resources.MemoryReservation != 0 ||
+ cgroup.Resources.MemorySwap > 0 ||
+ cgroup.Resources.KernelMemory > 0 ||
+ cgroup.Resources.KernelMemoryTCP > 0 ||
+ cgroup.Resources.OomKillDisable ||
+ (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
+}
+
// getMemoryData reads the usage/max_usage/failcnt/limit counters for one
// memory module: name "" means plain memory, otherwise a sub-module such
// as "memsw", "kmem", or "kmem.tcp". For sub-modules, a missing counter
// file (feature not enabled in the kernel) yields empty data instead of
// an error.
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
	memoryData := cgroups.MemoryData{}

	moduleName := "memory"
	if name != "" {
		moduleName = strings.Join([]string{"memory", name}, ".")
	}
	usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
	maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
	failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
	limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")

	value, err := getCgroupParamUint(path, usage)
	if err != nil {
		if moduleName != "memory" && os.IsNotExist(err) {
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
	}
	memoryData.Usage = value
	value, err = getCgroupParamUint(path, maxUsage)
	if err != nil {
		if moduleName != "memory" && os.IsNotExist(err) {
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
	}
	memoryData.MaxUsage = value
	value, err = getCgroupParamUint(path, failcnt)
	if err != nil {
		if moduleName != "memory" && os.IsNotExist(err) {
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
	}
	memoryData.Failcnt = value
	value, err = getCgroupParamUint(path, limit)
	if err != nil {
		if moduleName != "memory" && os.IsNotExist(err) {
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
	}
	memoryData.Limit = value

	return memoryData, nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strconv"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
// Canned file contents used as memory cgroup fixtures in the tests below.
const (
	memoryStatContents = `cache 512
rss 1024`
	memoryUsageContents        = "2048\n"
	memoryMaxUsageContents     = "4096\n"
	memoryFailcnt              = "100\n"
	memoryLimitContents        = "8192\n"
	memoryUseHierarchyContents = "1\n"
)
+
// TestMemorySetMemory verifies that Set updates memory.limit_in_bytes and
// memory.soft_limit_in_bytes.
func TestMemorySetMemory(t *testing.T) {
	helper := NewCgroupTestUtil("memory", t)
	defer helper.cleanup()

	const (
		memoryBefore      = 314572800 // 300M
		memoryAfter       = 524288000 // 500M
		reservationBefore = 209715200 // 200M
		reservationAfter  = 314572800 // 300M
	)

	helper.writeFileContents(map[string]string{
		"memory.limit_in_bytes":      strconv.Itoa(memoryBefore),
		"memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
	})

	helper.CgroupData.config.Resources.Memory = memoryAfter
	helper.CgroupData.config.Resources.MemoryReservation = reservationAfter
	memory := &MemoryGroup{}
	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
	if err != nil {
		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
	}
	if value != memoryAfter {
		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
	}

	value, err = getCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
	if err != nil {
		t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
	}
	if value != reservationAfter {
		t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
	}
}
+
// TestMemorySetMemoryswap verifies that Set updates
// memory.memsw.limit_in_bytes when only the swap limit is configured.
func TestMemorySetMemoryswap(t *testing.T) {
	helper := NewCgroupTestUtil("memory", t)
	defer helper.cleanup()

	const (
		memoryswapBefore = 314572800 // 300M
		memoryswapAfter  = 524288000 // 500M
	)

	helper.writeFileContents(map[string]string{
		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
	})

	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
	memory := &MemoryGroup{}
	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
	if err != nil {
		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
	}
	if value != memoryswapAfter {
		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
	}
}
+
// TestMemorySetMemoryLargerThanSwap verifies the raise-limits write order:
// when both limits grow, the swap limit must be written before the memory
// limit to satisfy the kernel's memsw >= mem invariant.
func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
	helper := NewCgroupTestUtil("memory", t)
	defer helper.cleanup()

	const (
		memoryBefore     = 314572800 // 300M
		memoryswapBefore = 524288000 // 500M
		memoryAfter      = 629145600 // 600M
		memoryswapAfter  = 838860800 // 800M
	)

	helper.writeFileContents(map[string]string{
		"memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
		// Set will call getMemoryData when memory and swap memory are
		// both set, fake these fields so we don't get error.
		"memory.usage_in_bytes":     "0",
		"memory.max_usage_in_bytes": "0",
		"memory.failcnt":            "0",
	})

	helper.CgroupData.config.Resources.Memory = memoryAfter
	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
	memory := &MemoryGroup{}
	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
	if err != nil {
		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
	}
	if value != memoryAfter {
		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
	}
	value, err = getCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
	if err != nil {
		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
	}
	if value != memoryswapAfter {
		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
	}
}
+
+func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+
+ const (
+ memoryBefore = 629145600 // 600M
+ memoryswapBefore = 838860800 // 800M
+ memoryAfter = 314572800 // 300M
+ memoryswapAfter = 524288000 // 500M
+ )
+
+ helper.writeFileContents(map[string]string{
+ "memory.limit_in_bytes": strconv.Itoa(memoryBefore),
+ "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+ // Set will call getMemoryData when memory and swap memory are
+ // both set, fake these fields so we don't get error.
+ "memory.usage_in_bytes": "0",
+ "memory.max_usage_in_bytes": "0",
+ "memory.failcnt": "0",
+ })
+
+ helper.CgroupData.config.Resources.Memory = memoryAfter
+ helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+ memory := &MemoryGroup{}
+ if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+ if err != nil {
+ t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+ }
+ if value != memoryAfter {
+ t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+ }
+ value, err = getCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+ if err != nil {
+ t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+ }
+ if value != memoryswapAfter {
+ t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
+ }
+}
+
+func TestMemorySetKernelMemory(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+
+ const (
+ kernelMemoryBefore = 314572800 // 300M
+ kernelMemoryAfter = 524288000 // 500M
+ )
+
+ helper.writeFileContents(map[string]string{
+ "memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
+ })
+
+ helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
+ memory := &MemoryGroup{}
+ if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
+ if err != nil {
+ t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
+ }
+ if value != kernelMemoryAfter {
+ t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
+ }
+}
+
+func TestMemorySetKernelMemoryTCP(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+
+ const (
+ kernelMemoryTCPBefore = 314572800 // 300M
+ kernelMemoryTCPAfter = 524288000 // 500M
+ )
+
+ helper.writeFileContents(map[string]string{
+ "memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore),
+ })
+
+ helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter
+ memory := &MemoryGroup{}
+ if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes")
+ if err != nil {
+ t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err)
+ }
+ if value != kernelMemoryTCPAfter {
+ t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.")
+ }
+}
+
+func TestMemorySetMemorySwappinessDefault(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+
+ swappinessBefore := 60 //default is 60
+ swappinessAfter := uint64(0)
+
+ helper.writeFileContents(map[string]string{
+ "memory.swappiness": strconv.Itoa(swappinessBefore),
+ })
+
+ helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter
+ memory := &MemoryGroup{}
+ if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "memory.swappiness")
+ if err != nil {
+ t.Fatalf("Failed to parse memory.swappiness - %s", err)
+ }
+ if value != swappinessAfter {
+ t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter)
+ }
+}
+
+func TestMemoryStats(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.failcnt": memoryFailcnt,
+ "memory.memsw.usage_in_bytes": memoryUsageContents,
+ "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.memsw.failcnt": memoryFailcnt,
+ "memory.memsw.limit_in_bytes": memoryLimitContents,
+ "memory.kmem.usage_in_bytes": memoryUsageContents,
+ "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.kmem.failcnt": memoryFailcnt,
+ "memory.kmem.limit_in_bytes": memoryLimitContents,
+ "memory.use_hierarchy": memoryUseHierarchyContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err != nil {
+ t.Fatal(err)
+ }
+ expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true}
+ expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats)
+}
+
+func TestMemoryStatsNoStatFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestMemoryStatsNoUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsBadStatFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": "rss rss",
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsBadUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": "bad",
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.max_usage_in_bytes": "bad",
+ "memory.limit_in_bytes": memoryLimitContents,
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
+func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
+ helper := NewCgroupTestUtil("memory", t)
+ defer helper.cleanup()
+ helper.writeFileContents(map[string]string{
+ "memory.stat": memoryStatContents,
+ "memory.usage_in_bytes": memoryUsageContents,
+ "memory.max_usage_in_bytes": memoryMaxUsageContents,
+ "memory.limit_in_bytes": "bad",
+ })
+
+ memory := &MemoryGroup{}
+ actualStats := *cgroups.NewStats()
+ err := memory.GetStats(helper.CgroupPath, &actualStats)
+ if err == nil {
+ t.Fatal("Expected failure")
+ }
+}
+
// TestMemorySetOomControl runs Set against a mock memory.oom_control file
// pre-seeded with 1 and checks the value is still 1 afterwards.
//
// NOTE(review): unlike the other Set tests, no Resources field related to
// OOM control is assigned before calling Set, so this only verifies that
// Set does not clobber the pre-seeded value — it never exercises a write
// of oom_control. Confirm against MemoryGroup.Set whether an
// OomKillDisable-style field should be set here.
func TestMemorySetOomControl(t *testing.T) {
	helper := NewCgroupTestUtil("memory", t)
	defer helper.cleanup()

	const (
		oomKillDisable = 1 // disable oom killer, default is 0
	)

	helper.writeFileContents(map[string]string{
		"memory.oom_control": strconv.Itoa(oomKillDisable),
	})

	memory := &MemoryGroup{}
	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
		t.Fatal(err)
	}

	value, err := getCgroupParamUint(helper.CgroupPath, "memory.oom_control")
	if err != nil {
		t.Fatalf("Failed to parse memory.oom_control - %s", err)
	}

	if value != oomKillDisable {
		t.Fatalf("Got the wrong value, set memory.oom_control failed.")
	}
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// NameGroup handles "named" cgroup hierarchies (cgroup_root/name=<x>),
// e.g. systemd's name=systemd hierarchy. It carries no resource knobs.
type NameGroup struct {
	// GroupName is the name of the named hierarchy.
	GroupName string
	// Join controls whether Apply/Remove actually join/remove the cgroup.
	Join bool
}
+
// Name returns the named hierarchy's group name.
func (s *NameGroup) Name() string {
	return s.GroupName
}
+
+func (s *NameGroup) Apply(d *cgroupData) error {
+ if s.Join {
+ // ignore errors if the named cgroup does not exist
+ d.join(s.GroupName)
+ }
+ return nil
+}
+
// Set is a no-op: named hierarchies expose no tunable resources.
func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
	return nil
}
+
+func (s *NameGroup) Remove(d *cgroupData) error {
+ if s.Join {
+ removePath(d.path(s.GroupName))
+ }
+ return nil
+}
+
// GetStats is a no-op: named hierarchies report no statistics.
func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strconv"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// NetClsGroup implements the net_cls cgroup subsystem, which tags network
// packets with a class identifier (net_cls.classid).
type NetClsGroup struct {
}
+
// Name returns the subsystem name used for the cgroup mount.
func (s *NetClsGroup) Name() string {
	return "net_cls"
}
+
+func (s *NetClsGroup) Apply(d *cgroupData) error {
+ _, err := d.join("net_cls")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if cgroup.Resources.NetClsClassid != 0 {
+ if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Remove deletes this cgroup's net_cls directory.
func (s *NetClsGroup) Remove(d *cgroupData) error {
	return removePath(d.path("net_cls"))
}
+
// GetStats is a no-op: net_cls exposes no statistics.
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strconv"
+ "testing"
+)
+
// classidBefore is pre-seeded into the mock net_cls.classid file;
// classidAfter is the value the test asks Set to write.
const (
	classidBefore = 0x100002
	classidAfter  = 0x100001
)
+
+func TestNetClsSetClassid(t *testing.T) {
+ helper := NewCgroupTestUtil("net_cls", t)
+ defer helper.cleanup()
+
+ helper.writeFileContents(map[string]string{
+ "net_cls.classid": strconv.FormatUint(classidBefore, 10),
+ })
+
+ helper.CgroupData.config.Resources.NetClsClassid = classidAfter
+ netcls := &NetClsGroup{}
+ if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ // As we are in mock environment, we can't get correct value of classid from
+ // net_cls.classid.
+ // So. we just judge if we successfully write classid into file
+ value, err := getCgroupParamUint(helper.CgroupPath, "net_cls.classid")
+ if err != nil {
+ t.Fatalf("Failed to parse net_cls.classid - %s", err)
+ }
+ if value != classidAfter {
+ t.Fatal("Got the wrong value, set net_cls.classid failed.")
+ }
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// NetPrioGroup implements the net_prio cgroup subsystem, which sets
// per-network-interface egress priorities (net_prio.ifpriomap).
type NetPrioGroup struct {
}
+
// Name returns the subsystem name used for the cgroup mount.
func (s *NetPrioGroup) Name() string {
	return "net_prio"
}
+
+func (s *NetPrioGroup) Apply(d *cgroupData) error {
+ _, err := d.join("net_prio")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
+ for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
+ if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Remove deletes this cgroup's net_prio directory.
func (s *NetPrioGroup) Remove(d *cgroupData) error {
	return removePath(d.path("net_prio"))
}
+
// GetStats is a no-op: net_prio exposes no statistics.
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// prioMap is the fixture used by TestNetPrioSetIfPrio: a single
// interface/priority pair expected to serialize as "test 5".
var (
	prioMap = []*configs.IfPrioMap{
		{
			Interface: "test",
			Priority:  5,
		},
	}
)
+
+func TestNetPrioSetIfPrio(t *testing.T) {
+ helper := NewCgroupTestUtil("net_prio", t)
+ defer helper.cleanup()
+
+ helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
+ netPrio := &NetPrioGroup{}
+ if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
+ if err != nil {
+ t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
+ }
+ if !strings.Contains(value, "test 5") {
+ t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")
+ }
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// PerfEventGroup implements the perf_event cgroup subsystem. It has no
// tunables or statistics; membership alone enables per-cgroup perf
// monitoring.
type PerfEventGroup struct {
}
+
// Name returns the subsystem name used for the cgroup mount.
func (s *PerfEventGroup) Name() string {
	return "perf_event"
}
+
+func (s *PerfEventGroup) Apply(d *cgroupData) error {
+ // we just want to join this group even though we don't set anything
+ if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
// Set is a no-op: perf_event exposes no tunable resources.
func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
	return nil
}
+
// Remove deletes this cgroup's perf_event directory.
func (s *PerfEventGroup) Remove(d *cgroupData) error {
	return removePath(d.path("perf_event"))
}
+
// GetStats is a no-op: perf_event exposes no statistics.
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
	return nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "path/filepath"
+ "strconv"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// PidsGroup implements the pids cgroup subsystem, which limits the number
// of tasks (pids.max) and reports the current count (pids.current).
type PidsGroup struct {
}
+
// Name returns the subsystem name used for the cgroup mount.
func (s *PidsGroup) Name() string {
	return "pids"
}
+
+func (s *PidsGroup) Apply(d *cgroupData) error {
+ _, err := d.join("pids")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ return nil
+}
+
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
+ if cgroup.Resources.PidsLimit != 0 {
+ // "max" is the fallback value.
+ limit := "max"
+
+ if cgroup.Resources.PidsLimit > 0 {
+ limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+ }
+
+ if err := writeFile(path, "pids.max", limit); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Remove deletes this cgroup's pids directory.
func (s *PidsGroup) Remove(d *cgroupData) error {
	return removePath(d.path("pids"))
}
+
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
+ current, err := getCgroupParamUint(path, "pids.current")
+ if err != nil {
+ return fmt.Errorf("failed to parse pids.current - %s", err)
+ }
+
+ maxString, err := getCgroupParamString(path, "pids.max")
+ if err != nil {
+ return fmt.Errorf("failed to parse pids.max - %s", err)
+ }
+
+ // Default if pids.max == "max" is 0 -- which represents "no limit".
+ var max uint64
+ if maxString != "max" {
+ max, err = parseUint(maxString, 10, 64)
+ if err != nil {
+ return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
+ }
+ }
+
+ stats.PidsStats.Current = current
+ stats.PidsStats.Limit = max
+ return nil
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "strconv"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
// maxUnlimited is a negative PidsLimit, which Set translates to the
// literal "max"; maxLimited is an ordinary numeric limit.
const (
	maxUnlimited = -1
	maxLimited   = 1024
)
+
+func TestPidsSetMax(t *testing.T) {
+ helper := NewCgroupTestUtil("pids", t)
+ defer helper.cleanup()
+
+ helper.writeFileContents(map[string]string{
+ "pids.max": "max",
+ })
+
+ helper.CgroupData.config.Resources.PidsLimit = maxLimited
+ pids := &PidsGroup{}
+ if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamUint(helper.CgroupPath, "pids.max")
+ if err != nil {
+ t.Fatalf("Failed to parse pids.max - %s", err)
+ }
+
+ if value != maxLimited {
+ t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value)
+ }
+}
+
+func TestPidsSetUnlimited(t *testing.T) {
+ helper := NewCgroupTestUtil("pids", t)
+ defer helper.cleanup()
+
+ helper.writeFileContents(map[string]string{
+ "pids.max": strconv.Itoa(maxLimited),
+ })
+
+ helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
+ pids := &PidsGroup{}
+ if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ value, err := getCgroupParamString(helper.CgroupPath, "pids.max")
+ if err != nil {
+ t.Fatalf("Failed to parse pids.max - %s", err)
+ }
+
+ if value != "max" {
+ t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value)
+ }
+}
+
+func TestPidsStats(t *testing.T) {
+ helper := NewCgroupTestUtil("pids", t)
+ defer helper.cleanup()
+
+ helper.writeFileContents(map[string]string{
+ "pids.current": strconv.Itoa(1337),
+ "pids.max": strconv.Itoa(maxLimited),
+ })
+
+ pids := &PidsGroup{}
+ stats := *cgroups.NewStats()
+ if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+ t.Fatal(err)
+ }
+
+ if stats.PidsStats.Current != 1337 {
+ t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current)
+ }
+
+ if stats.PidsStats.Limit != maxLimited {
+ t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit)
+ }
+}
+
+func TestPidsStatsUnlimited(t *testing.T) {
+ helper := NewCgroupTestUtil("pids", t)
+ defer helper.cleanup()
+
+ helper.writeFileContents(map[string]string{
+ "pids.current": strconv.Itoa(4096),
+ "pids.max": "max",
+ })
+
+ pids := &PidsGroup{}
+ stats := *cgroups.NewStats()
+ if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+ t.Fatal(err)
+ }
+
+ if stats.PidsStats.Current != 4096 {
+ t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current)
+ }
+
+ if stats.PidsStats.Limit != 0 {
+ t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit)
+ }
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+
+ "github.com/sirupsen/logrus"
+)
+
+func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error {
+ if len(expected) != len(actual) {
+ return fmt.Errorf("blkioStatEntries length do not match")
+ }
+ for i, expValue := range expected {
+ actValue := actual[i]
+ if expValue != actValue {
+ return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue)
+ }
+ }
+ return nil
+}
+
+func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) {
+ if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil {
+ logrus.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil {
+ logrus.Printf("blkio IoServicedRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil {
+ logrus.Printf("blkio IoQueuedRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil {
+ logrus.Printf("blkio SectorsRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil {
+ logrus.Printf("blkio IoServiceTimeRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil {
+ logrus.Printf("blkio IoWaitTimeRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil {
+ logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive)
+ t.Fail()
+ }
+
+ if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil {
+ logrus.Printf("blkio IoTimeRecursive do not match - %s\n", err)
+ t.Fail()
+ }
+}
+
+func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) {
+ if expected != actual {
+ logrus.Printf("Expected throttling data %v but found %v\n", expected, actual)
+ t.Fail()
+ }
+}
+
+func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) {
+ if expected != actual {
+ logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual)
+ t.Fail()
+ }
+}
+
+func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) {
+ expectMemoryDataEquals(t, expected.Usage, actual.Usage)
+ expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage)
+ expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage)
+
+ if expected.UseHierarchy != actual.UseHierarchy {
+ logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy)
+ t.Fail()
+ }
+
+ for key, expValue := range expected.Stats {
+ actValue, ok := actual.Stats[key]
+ if !ok {
+ logrus.Printf("Expected memory stat key %s not found\n", key)
+ t.Fail()
+ }
+ if expValue != actValue {
+ logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue)
+ t.Fail()
+ }
+ }
+}
+
// expectMemoryDataEquals fails the test when any field of the two memory
// usage snapshots differs, logging each mismatched field separately.
func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) {
	if expected.Usage != actual.Usage {
		logrus.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage)
		t.Fail()
	}
	if expected.MaxUsage != actual.MaxUsage {
		logrus.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage)
		t.Fail()
	}
	if expected.Failcnt != actual.Failcnt {
		logrus.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt)
		t.Fail()
	}
	if expected.Limit != actual.Limit {
		logrus.Printf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit)
		t.Fail()
	}
}
--- /dev/null
+// +build linux
+
+/*
+Utility for testing cgroup operations.
+
+Creates a mock of the cgroup filesystem for the duration of the test.
+*/
+package fs
+
+import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// cgroupTestUtil mocks a single cgroup subsystem directory on a temporary
// filesystem. Callers must invoke cleanup() (typically via defer) to
// remove the temporary directory.
type cgroupTestUtil struct {
	// cgroup data to use in tests.
	CgroupData *cgroupData

	// Path to the mock cgroup directory.
	CgroupPath string

	// Temporary directory to store mock cgroup filesystem.
	tempDir string
	t       *testing.T
}
+
+// Creates a new test util for the specified subsystem
+func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil {
+ d := &cgroupData{
+ config: &configs.Cgroup{},
+ }
+ d.config.Resources = &configs.Resources{}
+ tempDir, err := ioutil.TempDir("", "cgroup_test")
+ if err != nil {
+ t.Fatal(err)
+ }
+ d.root = tempDir
+ testCgroupPath := filepath.Join(d.root, subsystem)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Ensure the full mock cgroup path exists.
+ err = os.MkdirAll(testCgroupPath, 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t}
+}
+
// cleanup removes the mock cgroup filesystem's temporary directory.
func (c *cgroupTestUtil) cleanup() {
	os.RemoveAll(c.tempDir)
}
+
+// Write the specified contents on the mock of the specified cgroup files.
+func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) {
+ for file, contents := range fileContents {
+ err := writeFile(c.CgroupPath, file, contents)
+ if err != nil {
+ c.t.Fatal(err)
+ }
+ }
+}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "errors"
+ "fmt"
+ "io/ioutil"
+ "path/filepath"
+ "strconv"
+ "strings"
+)
+
// ErrNotValidFormat reports that a cgroup file line is not in the
// expected "key value" two-field form.
var (
	ErrNotValidFormat = errors.New("line is not a valid key value format")
)
+
// parseUint parses s as a uint64, saturating negative values at zero.
// Due to kernel bugs, some memory cgroup stats can be negative; both
// representable negatives and values below MinInt64 are mapped to 0.
func parseUint(s string, base, bitSize int) (uint64, error) {
	value, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return value, nil
	}

	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	switch {
	case intErr == nil && intValue < 0:
		// A plain negative number: saturate at zero.
		return 0, nil
	case intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0:
		// Negative and below MinInt64 (ParseInt saturates): also zero.
		return 0, nil
	}

	return value, err
}
+
+// Parses a cgroup param and returns as name, value
+// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234
+func getCgroupParamKeyValue(t string) (string, uint64, error) {
+ parts := strings.Fields(t)
+ switch len(parts) {
+ case 2:
+ value, err := parseUint(parts[1], 10, 64)
+ if err != nil {
+ return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
+ }
+
+ return parts[0], value, nil
+ default:
+ return "", 0, ErrNotValidFormat
+ }
+}
+
+// Gets a single uint64 value from the specified cgroup file.
+func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
+ fileName := filepath.Join(cgroupPath, cgroupFile)
+ contents, err := ioutil.ReadFile(fileName)
+ if err != nil {
+ return 0, err
+ }
+
+ res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
+ if err != nil {
+ return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
+ }
+ return res, nil
+}
+
// getCgroupParamString reads cgroupFile under cgroupPath and returns its
// contents with surrounding whitespace trimmed.
func getCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
	fileName := filepath.Join(cgroupPath, cgroupFile)
	contents, err := ioutil.ReadFile(fileName)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(contents)), nil
}
--- /dev/null
+// +build linux
+
+package fs
+
+import (
+ "io/ioutil"
+ "math"
+ "os"
+ "path/filepath"
+ "strconv"
+ "testing"
+)
+
// cgroupFile is the mock file name; floatValue/floatString are the same
// number as an (untyped) numeric constant and as its file contents.
const (
	cgroupFile  = "cgroup.file"
	floatValue  = 2048.0
	floatString = "2048"
)
+
+func TestGetCgroupParamsInt(t *testing.T) {
+ // Setup tempdir.
+ tempDir, err := ioutil.TempDir("", "cgroup_utils_test")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(tempDir)
+ tempFile := filepath.Join(tempDir, cgroupFile)
+
+ // Success.
+ err = ioutil.WriteFile(tempFile, []byte(floatString), 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ value, err := getCgroupParamUint(tempDir, cgroupFile)
+ if err != nil {
+ t.Fatal(err)
+ } else if value != floatValue {
+ t.Fatalf("Expected %d to equal %f", value, floatValue)
+ }
+
+ // Success with new line.
+ err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ value, err = getCgroupParamUint(tempDir, cgroupFile)
+ if err != nil {
+ t.Fatal(err)
+ } else if value != floatValue {
+ t.Fatalf("Expected %d to equal %f", value, floatValue)
+ }
+
+ // Success with negative values
+ err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ value, err = getCgroupParamUint(tempDir, cgroupFile)
+ if err != nil {
+ t.Fatal(err)
+ } else if value != 0 {
+ t.Fatalf("Expected %d to equal %d", value, 0)
+ }
+
+ // Success with negative values lesser than min int64
+ s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64)
+ err = ioutil.WriteFile(tempFile, []byte(s), 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ value, err = getCgroupParamUint(tempDir, cgroupFile)
+ if err != nil {
+ t.Fatal(err)
+ } else if value != 0 {
+ t.Fatalf("Expected %d to equal %d", value, 0)
+ }
+
+ // Not a float.
+ err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ _, err = getCgroupParamUint(tempDir, cgroupFile)
+ if err == nil {
+ t.Fatal("Expecting error, got none")
+ }
+
+ // Unknown file.
+ err = os.Remove(tempFile)
+ if err != nil {
+ t.Fatal(err)
+ }
+ _, err = getCgroupParamUint(tempDir, cgroupFile)
+ if err == nil {
+ t.Fatal("Expecting error, got none")
+ }
+}
--- /dev/null
+// +build linux
+
+package cgroups
+
// ThrottlingData holds CPU bandwidth throttling counters for a cgroup.
type ThrottlingData struct {
	// Number of periods with throttling active
	Periods uint64 `json:"periods,omitempty"`
	// Number of periods when the container hit its throttling limit.
	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
	// Aggregate time the container was throttled for in nanoseconds.
	ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
+
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
	// Total CPU time consumed.
	// Units: nanoseconds.
	TotalUsage uint64 `json:"total_usage,omitempty"`
	// Total CPU time consumed per core.
	// Units: nanoseconds.
	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
	// Time spent by tasks of the cgroup in kernel mode.
	// Units: nanoseconds.
	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
	// Time spent by tasks of the cgroup in user mode.
	// Units: nanoseconds.
	UsageInUsermode uint64 `json:"usage_in_usermode"`
}
+
// CpuStats groups CPU usage with bandwidth throttling data.
type CpuStats struct {
	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
+
// MemoryData is one memory accounting group's usage/max/failcnt/limit
// quadruple, as read from the corresponding memory.* cgroup files.
type MemoryData struct {
	// Current usage in bytes.
	Usage uint64 `json:"usage,omitempty"`
	// High-water mark in bytes.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// Number of times the limit was hit.
	Failcnt uint64 `json:"failcnt"`
	// Configured limit in bytes.
	Limit uint64 `json:"limit"`
}
+
// MemoryStats aggregates the memory cgroup's accounting groups and the
// raw key/value pairs from memory.stat.
type MemoryStats struct {
	// memory used for cache
	Cache uint64 `json:"cache,omitempty"`
	// usage of memory
	Usage MemoryData `json:"usage,omitempty"`
	// usage of memory + swap
	SwapUsage MemoryData `json:"swap_usage,omitempty"`
	// usage of kernel memory
	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
	// usage of kernel TCP memory
	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
	UseHierarchy bool `json:"use_hierarchy"`

	// Raw entries parsed from memory.stat.
	Stats map[string]uint64 `json:"stats,omitempty"`
}
+
// PidsStats holds statistics from the pids controller.
type PidsStats struct {
	// number of pids in the cgroup
	Current uint64 `json:"current,omitempty"`
	// active pids hard limit
	Limit uint64 `json:"limit,omitempty"`
}
+
// BlkioStatEntry is one record from a blkio stat file: a value for a
// given major:minor device and operation type.
type BlkioStatEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}
+
// BlkioStats groups the per-device statistics from the blkio controller.
type BlkioStats struct {
	// number of bytes transferred to and from the block device
	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}
+
// HugetlbStats holds the counters of the hugetlb controller for a single
// huge page size.
type HugetlbStats struct {
	// current res_counter usage for hugetlb
	Usage uint64 `json:"usage,omitempty"`
	// maximum usage ever recorded.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// number of times hugetlb allocation failed. Serialized even when zero.
	Failcnt uint64 `json:"failcnt"`
}
+
// Stats is the top-level container for all cgroup statistics collected
// by the managers in this package. Construct it with NewStats so the
// nested maps are initialized.
type Stats struct {
	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
	// the map is in the format "size of hugepage: stats of the hugepage"
	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
}
+
+func NewStats() *Stats {
+ memoryStats := MemoryStats{Stats: make(map[string]uint64)}
+ hugetlbStats := make(map[string]HugetlbStats)
+ return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
+}
--- /dev/null
+// +build !linux static_build
+
+package systemd
+
+import (
+ "fmt"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Manager is a stub cgroup manager used when systemd support is
// compiled out (non-Linux or static builds); every operation fails.
type Manager struct {
	Cgroups *configs.Cgroup
	Paths   map[string]string
}
+
// UseSystemd always reports false in this build: systemd cgroup
// management is unavailable.
func UseSystemd() bool {
	return false
}
+
// Apply is unsupported in this build and always returns an error.
func (m *Manager) Apply(pid int) error {
	return fmt.Errorf("Systemd not supported")
}
+
// GetPids is unsupported in this build and always returns an error.
func (m *Manager) GetPids() ([]int, error) {
	return nil, fmt.Errorf("Systemd not supported")
}
+
// GetAllPids is unsupported in this build and always returns an error.
func (m *Manager) GetAllPids() ([]int, error) {
	return nil, fmt.Errorf("Systemd not supported")
}
+
// Destroy is unsupported in this build and always returns an error.
func (m *Manager) Destroy() error {
	return fmt.Errorf("Systemd not supported")
}
+
// GetPaths always returns nil: no cgroup paths are managed in this build.
func (m *Manager) GetPaths() map[string]string {
	return nil
}
+
// GetStats is unsupported in this build and always returns an error.
func (m *Manager) GetStats() (*cgroups.Stats, error) {
	return nil, fmt.Errorf("Systemd not supported")
}
+
// Set is unsupported in this build and always returns an error.
func (m *Manager) Set(container *configs.Config) error {
	return fmt.Errorf("Systemd not supported")
}
+
// Freeze is unsupported in this build and always returns an error.
func (m *Manager) Freeze(state configs.FreezerState) error {
	return fmt.Errorf("Systemd not supported")
}
+
// Freeze (package-level) is unsupported in this build and always
// returns an error.
func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
	return fmt.Errorf("Systemd not supported")
}
--- /dev/null
+// +build linux,!static_build
+
+package systemd
+
+import (
+ "errors"
+ "fmt"
+ "math"
+ "os"
+ "path/filepath"
+ "strings"
+ "sync"
+ "time"
+
+ systemdDbus "github.com/coreos/go-systemd/dbus"
+ systemdUtil "github.com/coreos/go-systemd/util"
+ "github.com/godbus/dbus"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/sirupsen/logrus"
+)
+
// Manager manages a container's cgroups through systemd transient
// units. mu guards Paths against concurrent access.
type Manager struct {
	mu      sync.Mutex
	Cgroups *configs.Cgroup
	Paths   map[string]string
}
+
// subsystem is the per-controller interface implemented by the fs
// cgroup groups used for joining, stats collection, and configuration.
type subsystem interface {
	// Name returns the name of the subsystem.
	Name() string
	// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
	GetStats(path string, stats *cgroups.Stats) error
	// Set the cgroup represented by cgroup.
	Set(path string, cgroup *configs.Cgroup) error
}
+
// errSubsystemDoesNotExist is returned by subsystemSet.Get for names
// not present in the set.
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
// subsystemSet is an ordered collection of cgroup controllers.
type subsystemSet []subsystem
+
+func (s subsystemSet) Get(name string) (subsystem, error) {
+ for _, ss := range s {
+ if ss.Name() == name {
+ return ss, nil
+ }
+ }
+ return nil, errSubsystemDoesNotExist
+}
+
// subsystems is the fs-backed controller list used by this manager for
// joining processes and collecting statistics.
var subsystems = subsystemSet{
	&fs.CpusetGroup{},
	&fs.DevicesGroup{},
	&fs.MemoryGroup{},
	&fs.CpuGroup{},
	&fs.CpuacctGroup{},
	&fs.PidsGroup{},
	&fs.BlkioGroup{},
	&fs.HugetlbGroup{},
	&fs.PerfEventGroup{},
	&fs.FreezerGroup{},
	&fs.NetPrioGroup{},
	&fs.NetClsGroup{},
	&fs.NameGroup{GroupName: "name=systemd"},
}
+
const (
	// Maximum number of attempts made in UseSystemd to stop the probe
	// scope/slice units created while feature-testing systemd.
	testScopeWait = 4
	testSliceWait = 4
)
+
var (
	// connLock guards lazy creation of theConn.
	connLock sync.Mutex
	// theConn is the shared D-Bus connection to systemd, created on the
	// first call to UseSystemd.
	theConn *systemdDbus.Conn
	// Capability flags probed once in UseSystemd and cached afterwards.
	hasStartTransientUnit           bool
	hasStartTransientSliceUnit      bool
	hasTransientDefaultDependencies bool
	hasDelegateScope                bool
	hasDelegateSlice                bool
)
+
+func newProp(name string, units interface{}) systemdDbus.Property {
+ return systemdDbus.Property{
+ Name: name,
+ Value: dbus.MakeVariant(units),
+ }
+}
+
// UseSystemd reports whether systemd can be used to manage cgroups. On
// the first call it establishes the shared D-Bus connection and probes
// the running systemd for optional capabilities (transient scopes and
// slices, DefaultDependencies, Delegate), caching the results in the
// package-level flags; later calls return the cached answer.
func UseSystemd() bool {
	if !systemdUtil.IsRunningSystemd() {
		return false
	}

	connLock.Lock()
	defer connLock.Unlock()

	if theConn == nil {
		var err error
		theConn, err = systemdDbus.New()
		if err != nil {
			return false
		}

		// Assume we have StartTransientUnit
		hasStartTransientUnit = true

		// But if we get UnknownMethod error we don't
		if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
					hasStartTransientUnit = false
					return hasStartTransientUnit
				}
			}
		}

		// Ensure the scope name we use doesn't exist. Use the Pid to
		// avoid collisions between multiple libcontainer users on a
		// single host.
		scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid())
		testScopeExists := true
		for i := 0; i <= testScopeWait; i++ {
			if _, err := theConn.StopUnit(scope, "replace", nil); err != nil {
				if dbusError, ok := err.(dbus.Error); ok {
					// NoSuchUnit means the name is free — proceed with probing.
					if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
						testScopeExists = false
						break
					}
				}
			}
			time.Sleep(time.Millisecond)
		}

		// Bail out if we can't kill this scope without testing for DefaultDependencies
		if testScopeExists {
			return hasStartTransientUnit
		}

		// Assume StartTransientUnit on a scope allows DefaultDependencies
		hasTransientDefaultDependencies = true
		ddf := newProp("DefaultDependencies", false)
		if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
					hasTransientDefaultDependencies = false
				}
			}
		}

		// Not critical because of the stop unit logic above.
		theConn.StopUnit(scope, "replace", nil)

		// Assume StartTransientUnit on a scope allows Delegate
		hasDelegateScope = true
		dlScope := newProp("Delegate", true)
		if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dlScope}, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
					hasDelegateScope = false
				}
			}
		}

		// Assume we have the ability to start a transient unit as a slice
		// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
		// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
		hasStartTransientSliceUnit = true

		// To ensure simple clean-up, we create a slice off the root with no hierarchy
		slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
		if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
			if _, ok := err.(dbus.Error); ok {
				hasStartTransientSliceUnit = false
			}
		}

		for i := 0; i <= testSliceWait; i++ {
			if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
				if dbusError, ok := err.(dbus.Error); ok {
					if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
						hasStartTransientSliceUnit = false
						break
					}
				}
			} else {
				break
			}
			time.Sleep(time.Millisecond)
		}

		// Not critical because of the stop unit logic above.
		theConn.StopUnit(slice, "replace", nil)

		// Assume StartTransientUnit on a slice allows Delegate
		hasDelegateSlice = true
		dlSlice := newProp("Delegate", true)
		if _, err := theConn.StartTransientUnit(slice, "replace", []systemdDbus.Property{dlSlice}, nil); err != nil {
			if dbusError, ok := err.(dbus.Error); ok {
				// Starting with systemd v237, Delegate is not even a property of slices anymore,
				// so the D-Bus call fails with "InvalidArgs" error.
				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") {
					hasDelegateSlice = false
				}
			}
		}

		// Not critical because of the stop unit logic above.
		theConn.StopUnit(scope, "replace", nil)
		theConn.StopUnit(slice, "replace", nil)
	}
	return hasStartTransientUnit
}
+
// Apply creates the container's transient systemd unit (a scope, or a
// slice when the cgroup name ends in ".slice"), attaches resource
// properties, waits briefly for systemd to signal unit start, joins the
// remaining cgroup hierarchies, and records the resulting per-subsystem
// paths in m.Paths. pid == -1 creates the unit without attaching a
// process.
func (m *Manager) Apply(pid int) error {
	var (
		c          = m.Cgroups
		unitName   = getUnitName(c)
		slice      = "system.slice"
		properties []systemdDbus.Property
	)

	// Caller supplied explicit cgroup paths: just enter them, creating
	// no units of our own.
	if c.Paths != nil {
		paths := make(map[string]string)
		for name, path := range c.Paths {
			_, err := getSubsystemPath(m.Cgroups, name)
			if err != nil {
				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
				if cgroups.IsNotFound(err) {
					continue
				}
				return err
			}
			paths[name] = path
		}
		m.Paths = paths
		return cgroups.EnterPid(m.Paths, pid)
	}

	if c.Parent != "" {
		slice = c.Parent
	}

	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))

	// if we create a slice, the parent is defined via a Wants=
	if strings.HasSuffix(unitName, ".slice") {
		// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
		if !hasStartTransientSliceUnit {
			return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
		}
		properties = append(properties, systemdDbus.PropWants(slice))
	} else {
		// otherwise, we use Slice=
		properties = append(properties, systemdDbus.PropSlice(slice))
	}

	// only add pid if its valid, -1 is used w/ general slice creation.
	if pid != -1 {
		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
	}

	// Check if we can delegate. This is only supported on systemd versions 218 and above.
	if strings.HasSuffix(unitName, ".slice") {
		if hasDelegateSlice {
			// systemd 237 and above no longer allows delegation on a slice
			properties = append(properties, newProp("Delegate", true))
		}
	} else {
		if hasDelegateScope {
			properties = append(properties, newProp("Delegate", true))
		}
	}

	// Always enable accounting, this gets us the same behaviour as the fs implementation,
	// plus the kernel has some problems with joining the memory cgroup at a later time.
	properties = append(properties,
		newProp("MemoryAccounting", true),
		newProp("CPUAccounting", true),
		newProp("BlockIOAccounting", true))

	if hasTransientDefaultDependencies {
		properties = append(properties,
			newProp("DefaultDependencies", false))
	}

	if c.Resources.Memory != 0 {
		properties = append(properties,
			newProp("MemoryLimit", uint64(c.Resources.Memory)))
	}

	if c.Resources.CpuShares != 0 {
		properties = append(properties,
			newProp("CPUShares", c.Resources.CpuShares))
	}

	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
		// corresponds to USEC_INFINITY in systemd
		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
		// always setting a property value ensures we can apply a quota and remove it later
		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
		if c.Resources.CpuQuota > 0 {
			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
			// (integer percentage of CPU) internally. This means that if a fractional percent of
			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
			if cpuQuotaPerSecUSec%10000 != 0 {
				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
			}
		}
		properties = append(properties,
			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
	}

	if c.Resources.BlkioWeight != 0 {
		properties = append(properties,
			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
	}

	if c.Resources.PidsLimit > 0 {
		properties = append(properties,
			newProp("TasksAccounting", true),
			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
	}

	// We have to set kernel memory here, as we can't change it once
	// processes have been attached to the cgroup.
	if c.Resources.KernelMemory != 0 {
		if err := setKernelMemory(c); err != nil {
			return err
		}
	}

	statusChan := make(chan string, 1)
	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
		select {
		case <-statusChan:
		case <-time.After(time.Second):
			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
		}
	} else if !isUnitExists(err) {
		// An already-existing unit is tolerated; any other start failure
		// is fatal.
		return err
	}

	if err := joinCgroups(c, pid); err != nil {
		return err
	}

	paths := make(map[string]string)
	for _, s := range subsystems {
		subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
		if err != nil {
			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
			if cgroups.IsNotFound(err) {
				continue
			}
			return err
		}
		paths[s.Name()] = subsystemPath
	}
	m.Paths = paths
	return nil
}
+
// Destroy stops the container's transient unit and removes every cgroup
// path tracked by this manager. When the caller supplied explicit paths
// (m.Cgroups.Paths), the manager does not own them and Destroy is a
// no-op.
func (m *Manager) Destroy() error {
	if m.Cgroups.Paths != nil {
		return nil
	}
	m.mu.Lock()
	defer m.mu.Unlock()
	// The StopUnit error is deliberately ignored; removing the paths
	// below is what matters here.
	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
	if err := cgroups.RemovePaths(m.Paths); err != nil {
		return err
	}
	m.Paths = make(map[string]string)
	return nil
}
+
+func (m *Manager) GetPaths() map[string]string {
+ m.mu.Lock()
+ paths := m.Paths
+ m.mu.Unlock()
+ return paths
+}
+
// join creates the cgroup directory for subsystem if necessary, moves
// pid into it, and returns the resulting path.
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
	path, err := getSubsystemPath(c, subsystem)
	if err != nil {
		return "", err
	}
	if err := os.MkdirAll(path, 0755); err != nil {
		return "", err
	}
	if err := cgroups.WriteCgroupProc(path, pid); err != nil {
		return "", err
	}
	return path, nil
}
+
// joinCgroups attaches pid to every known controller except
// name=systemd (which systemd manages itself). cpuset requires special
// initialization via ApplyDir; for the rest, a missing hierarchy is
// tolerated except for devices, which is mandatory.
func joinCgroups(c *configs.Cgroup, pid int) error {
	for _, sys := range subsystems {
		name := sys.Name()
		switch name {
		case "name=systemd":
			// let systemd handle this
		case "cpuset":
			path, err := getSubsystemPath(c, name)
			if err != nil && !cgroups.IsNotFound(err) {
				return err
			}
			s := &fs.CpusetGroup{}
			if err := s.ApplyDir(path, c, pid); err != nil {
				return err
			}
		default:
			_, err := join(c, name, pid)
			if err != nil {
				// Even if it's `not found` error, we'll return err
				// because devices cgroup is hard requirement for
				// container security.
				if name == "devices" {
					return err
				}
				// For other subsystems, omit the `not found` error
				// because they are optional.
				if !cgroups.IsNotFound(err) {
					return err
				}
			}
		}
	}

	return nil
}
+
// ExpandSlice converts a systemd slice name such as "test-a-b.slice"
// into its cgroupfs path "/test.slice/test-a.slice/test-a-b.slice".
// systemd encodes the slice hierarchy in the unit name using "-".
func ExpandSlice(slice string) (string, error) {
	const suffix = ".slice"
	// The name must end with ".slice" (and be long enough to do so).
	if !strings.HasSuffix(slice, suffix) || len(slice) < len(suffix) {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	// Path separators are not allowed in a unit name.
	if strings.ContainsRune(slice, '/') {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	name := strings.TrimSuffix(slice, suffix)
	// "-.slice" is systemd's name for the root slice.
	if name == "-" {
		return "/", nil
	}

	out, prefix := "", ""
	for _, comp := range strings.Split(name, "-") {
		// Empty components ("test--a.slice", "-test.slice", ".slice")
		// are rejected.
		if comp == "" {
			return "", fmt.Errorf("invalid slice name: %s", slice)
		}
		// Each component adds one nesting level to the path.
		out += "/" + prefix + comp + suffix
		prefix += comp + "-"
	}
	return out, nil
}
+
// getSubsystemPath computes the absolute cgroup path for subsystem as
// <mountpoint>/<init cgroup>/<expanded parent slice>/<unit name>.
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
	mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
	if err != nil {
		return "", err
	}

	initPath, err := cgroups.GetInitCgroup(subsystem)
	if err != nil {
		return "", err
	}
	// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
	initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")

	// Default parent, unless the cgroup config names one.
	slice := "system.slice"
	if c.Parent != "" {
		slice = c.Parent
	}

	slice, err = ExpandSlice(slice)
	if err != nil {
		return "", err
	}

	return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
}
+
// Freeze transitions the container's freezer cgroup to state. On
// failure the previously recorded state is restored in the config.
func (m *Manager) Freeze(state configs.FreezerState) error {
	path, err := getSubsystemPath(m.Cgroups, "freezer")
	if err != nil {
		return err
	}
	prevState := m.Cgroups.Resources.Freezer
	m.Cgroups.Resources.Freezer = state
	freezer, err := subsystems.Get("freezer")
	if err != nil {
		return err
	}
	err = freezer.Set(path, m.Cgroups)
	if err != nil {
		// Roll back the recorded state so config and cgroup stay in sync.
		m.Cgroups.Resources.Freezer = prevState
		return err
	}
	return nil
}
+
// GetPids returns the PIDs in the container's devices cgroup (the
// hierarchy every container is guaranteed to have joined).
func (m *Manager) GetPids() ([]int, error) {
	path, err := getSubsystemPath(m.Cgroups, "devices")
	if err != nil {
		return nil, err
	}
	return cgroups.GetPids(path)
}
+
// GetAllPids returns the PIDs in the container's devices cgroup and all
// of its sub-cgroups.
func (m *Manager) GetAllPids() ([]int, error) {
	path, err := getSubsystemPath(m.Cgroups, "devices")
	if err != nil {
		return nil, err
	}
	return cgroups.GetAllPids(path)
}
+
// GetStats collects statistics from every tracked subsystem path.
// Unknown subsystem names and paths that have vanished are skipped
// silently.
func (m *Manager) GetStats() (*cgroups.Stats, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	stats := cgroups.NewStats()
	for name, path := range m.Paths {
		sys, err := subsystems.Get(name)
		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
			continue
		}
		if err := sys.GetStats(path, stats); err != nil {
			return nil, err
		}
	}

	return stats, nil
}
+
// Set applies the resource configuration in container.Cgroups to each
// subsystem. When m.Cgroups.Paths is set the container only joins
// pre-existing cgroups and nothing is written.
func (m *Manager) Set(container *configs.Config) error {
	// If Paths are set, then we are just joining cgroups paths
	// and there is no need to set any values.
	if m.Cgroups.Paths != nil {
		return nil
	}
	for _, sys := range subsystems {
		// Get the subsystem path, but don't error out for not found cgroups.
		path, err := getSubsystemPath(container.Cgroups, sys.Name())
		if err != nil && !cgroups.IsNotFound(err) {
			return err
		}
		// NOTE(review): on a not-found error path is "" and sys.Set is
		// still invoked with it — presumably each subsystem tolerates an
		// empty path; verify.
		if err := sys.Set(path, container.Cgroups); err != nil {
			return err
		}
	}

	if m.Paths["cpu"] != "" {
		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
			return err
		}
	}
	return nil
}
+
+func getUnitName(c *configs.Cgroup) string {
+ // by default, we create a scope unless the user explicitly asks for a slice.
+ if !strings.HasSuffix(c.Name, ".slice") {
+ return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
+ }
+ return c.Name
+}
+
// setKernelMemory pre-creates the memory cgroup and enables kernel
// memory accounting there; this must happen before any process is
// attached to the cgroup (see the caller in Apply).
func setKernelMemory(c *configs.Cgroup) error {
	path, err := getSubsystemPath(c, "memory")
	if err != nil && !cgroups.IsNotFound(err) {
		return err
	}

	// NOTE(review): on a not-found error path is "" and MkdirAll("")
	// will fail — confirm whether not-found should instead be a no-op.
	if err := os.MkdirAll(path, 0755); err != nil {
		return err
	}
	return fs.EnableKernelMemoryAccounting(path)
}
+
+// isUnitExists returns true if the error is that a systemd unit already exists.
+func isUnitExists(err error) bool {
+ if err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
+ }
+ }
+ return false
+}
--- /dev/null
+// +build linux
+
+package cgroups
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "time"
+
+ units "github.com/docker/go-units"
+)
+
const (
	// CgroupNamePrefix marks "named" hierarchies (e.g. "name=systemd")
	// in mountinfo options and /proc/<pid>/cgroup subsystem lists.
	CgroupNamePrefix = "name="
	// CgroupProcesses is the per-cgroup file listing member PIDs.
	CgroupProcesses = "cgroup.procs"
)
+
// FindCgroupMountpoint returns the mountpoint of the hierarchy carrying
// subsystem, restricted to mounts whose path starts with cgroupPath.
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
	return mnt, err
}
+
// FindCgroupMountpointAndRoot returns the mountpoint and hierarchy root
// of the mount carrying subsystem, restricted to mountpoints prefixed
// by cgroupPath. The subsystem must also appear in /proc/self/cgroup.
func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
	// We are not using mount.GetMounts() because it's super-inefficient,
	// parsing it directly sped up x10 times because of not using Sscanf.
	// It was one of two major performance drawbacks in container start.
	if !isSubsystemAvailable(subsystem) {
		return "", "", NewNotFoundError(subsystem)
	}

	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return "", "", err
	}
	defer f.Close()

	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
}
+
+func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
+ scanner := bufio.NewScanner(reader)
+ for scanner.Scan() {
+ txt := scanner.Text()
+ fields := strings.Fields(txt)
+ if len(fields) < 5 {
+ continue
+ }
+ if strings.HasPrefix(fields[4], cgroupPath) {
+ for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+ if opt == subsystem {
+ return fields[4], fields[3], nil
+ }
+ }
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return "", "", err
+ }
+
+ return "", "", NewNotFoundError(subsystem)
+}
+
// isSubsystemAvailable reports whether subsystem appears in the calling
// process's /proc/self/cgroup; parse failures count as unavailable.
func isSubsystemAvailable(subsystem string) bool {
	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
	if err != nil {
		return false
	}
	_, avail := cgroups[subsystem]
	return avail
}
+
// GetClosestMountpointAncestor returns the deepest mountpoint listed in
// the mountinfo content that is a path prefix of dir, or "" when none
// matches. Lines with fewer than five fields are ignored.
func GetClosestMountpointAncestor(dir, mountinfo string) string {
	deepest := ""
	for _, line := range strings.Split(mountinfo, "\n") {
		parts := strings.Fields(line)
		if len(parts) < 5 {
			continue
		}
		mp := parts[4]
		// A deeper match always carries the current best as its prefix.
		if strings.HasPrefix(dir, mp) && strings.HasPrefix(mp, deepest) {
			deepest = mp
		}
	}
	return deepest
}
+
+func FindCgroupMountpointDir() (string, error) {
+ f, err := os.Open("/proc/self/mountinfo")
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ text := scanner.Text()
+ fields := strings.Split(text, " ")
+ // Safe as mountinfo encodes mountpoints with spaces as \040.
+ index := strings.Index(text, " - ")
+ postSeparatorFields := strings.Fields(text[index+3:])
+ numPostFields := len(postSeparatorFields)
+
+ // This is an error as we can't detect if the mount is for "cgroup"
+ if numPostFields == 0 {
+ return "", fmt.Errorf("Found no fields post '-' in %q", text)
+ }
+
+ if postSeparatorFields[0] == "cgroup" {
+ // Check that the mount is properly formatted.
+ if numPostFields < 3 {
+ return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+ }
+
+ return filepath.Dir(fields[4]), nil
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return "", err
+ }
+
+ return "", NewNotFoundError("cgroup")
+}
+
// Mount describes a single cgroup hierarchy mount: where it is mounted,
// the root of the hierarchy within it, and the subsystems it carries.
type Mount struct {
	Mountpoint string
	Root       string
	Subsystems []string
}
+
// GetOwnCgroup returns the calling process's cgroup path for this
// mount's first subsystem, looked up in a parsed /proc/self/cgroup map.
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
	if len(m.Subsystems) == 0 {
		return "", fmt.Errorf("no subsystem for mount")
	}

	return getControllerPath(m.Subsystems[0], cgroups)
}
+
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+ res := make([]Mount, 0, len(ss))
+ scanner := bufio.NewScanner(mi)
+ numFound := 0
+ for scanner.Scan() && numFound < len(ss) {
+ txt := scanner.Text()
+ sepIdx := strings.Index(txt, " - ")
+ if sepIdx == -1 {
+ return nil, fmt.Errorf("invalid mountinfo format")
+ }
+ if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
+ continue
+ }
+ fields := strings.Split(txt, " ")
+ m := Mount{
+ Mountpoint: fields[4],
+ Root: fields[3],
+ }
+ for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+ seen, known := ss[opt]
+ if !known || (!all && seen) {
+ continue
+ }
+ ss[opt] = true
+ if strings.HasPrefix(opt, CgroupNamePrefix) {
+ opt = opt[len(CgroupNamePrefix):]
+ }
+ m.Subsystems = append(m.Subsystems, opt)
+ numFound++
+ }
+ if len(m.Subsystems) > 0 || all {
+ res = append(res, m)
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return nil, err
+ }
+ return res, nil
+}
+
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
func GetCgroupMounts(all bool) ([]Mount, error) {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
	if err != nil {
		return nil, err
	}

	// Seed the helper with every subsystem this process belongs to,
	// each marked "not seen yet".
	allMap := make(map[string]bool)
	for s := range allSubsystems {
		allMap[s] = false
	}
	return getCgroupMountsHelper(allMap, f, all)
}
+
// GetAllSubsystems returns all the cgroup subsystems supported by the
// kernel, read from /proc/cgroups. Controllers whose "enabled" column
// is "0" are skipped.
func GetAllSubsystems() ([]string, error) {
	f, err := os.Open("/proc/cgroups")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	subsystems := []string{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		text := s.Text()
		// Skip blank lines as well as the "#subsys_name ..." header;
		// indexing text[0] on an empty line would panic.
		if text == "" || text[0] == '#' {
			continue
		}
		// Columns: subsys_name hierarchy num_cgroups enabled.
		parts := strings.Fields(text)
		if len(parts) >= 4 && parts[3] != "0" {
			subsystems = append(subsystems, parts[0])
		}
	}
	if err := s.Err(); err != nil {
		return nil, err
	}
	return subsystems, nil
}
+
// GetOwnCgroup returns the relative cgroup path of the calling process
// for the given subsystem, as read from /proc/self/cgroup.
func GetOwnCgroup(subsystem string) (string, error) {
	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
	if err != nil {
		return "", err
	}

	return getControllerPath(subsystem, cgroups)
}
+
// GetOwnCgroupPath returns the absolute filesystem path of the calling
// process's cgroup for subsystem.
func GetOwnCgroupPath(subsystem string) (string, error) {
	cgroup, err := GetOwnCgroup(subsystem)
	if err != nil {
		return "", err
	}

	return getCgroupPathHelper(subsystem, cgroup)
}
+
// GetInitCgroup returns the relative cgroup path of pid 1 for the given
// subsystem, as read from /proc/1/cgroup.
func GetInitCgroup(subsystem string) (string, error) {
	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
	if err != nil {
		return "", err
	}

	return getControllerPath(subsystem, cgroups)
}
+
// GetInitCgroupPath returns the absolute filesystem path of pid 1's
// cgroup for subsystem.
func GetInitCgroupPath(subsystem string) (string, error) {
	cgroup, err := GetInitCgroup(subsystem)
	if err != nil {
		return "", err
	}

	return getCgroupPathHelper(subsystem, cgroup)
}
+
// getCgroupPathHelper converts a relative cgroup path into an absolute
// path under the subsystem's mountpoint, compensating for the hierarchy
// root observed inside nested containers.
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
	if err != nil {
		return "", err
	}

	// This is needed for nested containers, because in /proc/self/cgroup we
	// see paths from host, which don't exist in container.
	relCgroup, err := filepath.Rel(root, cgroup)
	if err != nil {
		return "", err
	}

	return filepath.Join(mnt, relCgroup), nil
}
+
+func readProcsFile(dir string) ([]int, error) {
+ f, err := os.Open(filepath.Join(dir, CgroupProcesses))
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ var (
+ s = bufio.NewScanner(f)
+ out = []int{}
+ )
+
+ for s.Scan() {
+ if t := s.Text(); t != "" {
+ pid, err := strconv.Atoi(t)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, pid)
+ }
+ }
+ return out, nil
+}
+
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	return parseCgroupFromReader(f)
}
+
+// helper function for ParseCgroupFile to make testing easier
+func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
+ s := bufio.NewScanner(r)
+ cgroups := make(map[string]string)
+
+ for s.Scan() {
+ text := s.Text()
+ // from cgroups(7):
+ // /proc/[pid]/cgroup
+ // ...
+ // For each cgroup hierarchy ... there is one entry
+ // containing three colon-separated fields of the form:
+ // hierarchy-ID:subsystem-list:cgroup-path
+ parts := strings.SplitN(text, ":", 3)
+ if len(parts) < 3 {
+ return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
+ }
+
+ for _, subs := range strings.Split(parts[1], ",") {
+ cgroups[subs] = parts[2]
+ }
+ }
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ return cgroups, nil
+}
+
+func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
+
+ if p, ok := cgroups[subsystem]; ok {
+ return p, nil
+ }
+
+ if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
+ return p, nil
+ }
+
+ return "", NewNotFoundError(subsystem)
+}
+
// PathExists reports whether path can be stat'ed; any error (including
// permission problems) is treated as "does not exist".
func PathExists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}
+
// EnterPid writes pid into the cgroup.procs file of every existing path
// in cgroupPaths; paths that do not exist are silently skipped.
func EnterPid(cgroupPaths map[string]string, pid int) error {
	for _, path := range cgroupPaths {
		if PathExists(path) {
			if err := WriteCgroupProc(path, pid); err != nil {
				return err
			}
		}
	}
	return nil
}
+
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
	delay := 10 * time.Millisecond
	for i := 0; i < 5; i++ {
		if i != 0 {
			time.Sleep(delay)
			delay *= 2
		}
		for s, p := range paths {
			os.RemoveAll(p)
			// TODO: here probably should be logging
			_, err := os.Stat(p)
			// We need this strange way of checking cgroups existence because
			// RemoveAll almost always returns error, even on already removed
			// cgroups
			if os.IsNotExist(err) {
				// Deleting map entries while ranging is well-defined in Go.
				delete(paths, s)
			}
		}
		if len(paths) == 0 {
			return nil
		}
	}
	return fmt.Errorf("Failed to remove paths: %v", paths)
}
+
// GetHugePageSize lists the supported huge page sizes by reading the
// hugepages-<size> directory names under /sys/kernel/mm/hugepages and
// normalizing each size via go-units.
func GetHugePageSize() ([]string, error) {
	var pageSizes []string
	sizeList := []string{"B", "kB", "MB", "GB", "TB", "PB"}
	files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
	if err != nil {
		return pageSizes, err
	}
	for _, st := range files {
		// NOTE(review): nameArray[1] assumes the directory name always
		// contains a "-" (e.g. "hugepages-2048kB"); a malformed entry
		// would panic — confirm the kernel guarantees this format.
		nameArray := strings.Split(st.Name(), "-")
		pageSize, err := units.RAMInBytes(nameArray[1])
		if err != nil {
			return []string{}, err
		}
		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, sizeList)
		pageSizes = append(pageSizes, sizeString)
	}

	return pageSizes, nil
}
+
// GetPids returns all pids, that were added to cgroup at path.
func GetPids(path string) ([]int, error) {
	return readProcsFile(path)
}
+
// GetAllPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
func GetAllPids(path string) ([]int, error) {
	var pids []int
	// collect pids from all sub-cgroups
	err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
		// Only cgroup.procs files are of interest.
		dir, file := filepath.Split(p)
		if file != CgroupProcesses {
			return nil
		}
		if iErr != nil {
			return iErr
		}
		cPids, err := readProcsFile(dir)
		if err != nil {
			return err
		}
		pids = append(pids, cPids...)
		return nil
	})
	return pids, err
}
+
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
	// Normally dir should not be empty, one case is that cgroup subsystem
	// is not mounted, we will get empty dir, and we want it fail here.
	if dir == "" {
		return fmt.Errorf("no such directory for %s", CgroupProcesses)
	}

	// Dont attach any pid to the cgroup if -1 is specified as a pid
	if pid != -1 {
		if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
			return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
		}
	}
	return nil
}
--- /dev/null
+// +build linux
+
+package cgroups
+
+import (
+ "bytes"
+ "fmt"
+ "reflect"
+ "strings"
+ "testing"
+)
+
// fedoraMountinfo is a /proc/self/mountinfo dump captured on a Fedora host
// with each cgroup v1 controller mounted under /sys/fs/cgroup/<name>; used
// as a fixture for the mountinfo parsing tests below.
const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755
18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw
20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755
23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755
24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children
27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children
28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children
29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children
30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children
31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children
32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children
33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children
34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children
35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered
36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel
38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel
39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel
40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw
42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw
43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw
45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered
46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered
47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered
48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered
121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000
124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw
165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered
167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered
171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered
175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered
179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered
183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered
187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered
191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered
195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered
199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered
203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered
207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered
211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered
215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered
219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered
223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered
227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered
231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered
235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered
239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered
243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered
247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered
31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1`
+
// systemdMountinfo is a mountinfo dump captured inside a Docker container:
// the cgroup mounts carry a non-"/" root (a systemd docker-….scope path),
// which exercises root-path handling in the parser.
const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1
116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755
118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666
119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755
121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices
123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
124 120 0:22 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio
126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio
127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children
128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct
129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event
130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered
131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered
132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered
133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered
134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k
135 117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000
84 115 0:40 / /tmp rw,relatime - tmpfs none rw`
+
// bedrockMountinfo is a mountinfo dump from a Bedrock Linux style layout in
// which the same cgroup hierarchies appear multiple times (once under
// /sys/fs/cgroup and again under each /bedrock/strata/* prefix, sharing the
// same peer group); the parser must tolerate these duplicate entries.
const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event`
+
// cgroup2Mountinfo is a mountinfo dump of a hybrid layout in which
// /sys/fs/cgroup/systemd is a cgroup2 mount while the controllers remain
// cgroup v1; used by TestIgnoreCgroup2Mount to verify that the cgroup2
// entry is skipped.
const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755
21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755
25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw
27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel
28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw
29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct
30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory
31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio
32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio
33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event
34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb
35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer
36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw
64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered
39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw
40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel
41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel
`
+
+func TestGetCgroupMounts(t *testing.T) {
+ type testData struct {
+ mountInfo string
+ root string
+ subsystems map[string]bool
+ }
+ testTable := []testData{
+ {
+ mountInfo: fedoraMountinfo,
+ root: "/",
+ subsystems: map[string]bool{
+ "cpuset": false,
+ "cpu": false,
+ "cpuacct": false,
+ "memory": false,
+ "devices": false,
+ "freezer": false,
+ "net_cls": false,
+ "blkio": false,
+ "perf_event": false,
+ "hugetlb": false,
+ },
+ },
+ {
+ mountInfo: systemdMountinfo,
+ root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope",
+ subsystems: map[string]bool{
+ "cpuset": false,
+ "cpu": false,
+ "cpuacct": false,
+ "memory": false,
+ "devices": false,
+ "freezer": false,
+ "net_cls": false,
+ "blkio": false,
+ "perf_event": false,
+ },
+ },
+ {
+ mountInfo: bedrockMountinfo,
+ root: "/",
+ subsystems: map[string]bool{
+ "cpuset": false,
+ "cpu": false,
+ "cpuacct": false,
+ "memory": false,
+ "devices": false,
+ "freezer": false,
+ "net_cls": false,
+ "blkio": false,
+ "perf_event": false,
+ },
+ },
+ }
+ for _, td := range testTable {
+ mi := bytes.NewBufferString(td.mountInfo)
+ cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false)
+ if err != nil {
+ t.Fatal(err)
+ }
+ cgMap := make(map[string]Mount)
+ for _, m := range cgMounts {
+ for _, ss := range m.Subsystems {
+ cgMap[ss] = m
+ }
+ }
+ for ss := range td.subsystems {
+ m, ok := cgMap[ss]
+ if !ok {
+ t.Fatalf("%s not found", ss)
+ }
+ if m.Root != td.root {
+ t.Fatalf("unexpected root for %s: %s", ss, m.Root)
+ }
+ if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) {
+ t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint)
+ }
+ var ssFound bool
+ for _, mss := range m.Subsystems {
+ if mss == ss {
+ ssFound = true
+ break
+ }
+ }
+ if !ssFound {
+ t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems)
+ }
+ }
+ }
+}
+
+func BenchmarkGetCgroupMounts(b *testing.B) {
+ subsystems := map[string]bool{
+ "cpuset": false,
+ "cpu": false,
+ "cpuacct": false,
+ "memory": false,
+ "devices": false,
+ "freezer": false,
+ "net_cls": false,
+ "blkio": false,
+ "perf_event": false,
+ "hugetlb": false,
+ }
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ b.StopTimer()
+ mi := bytes.NewBufferString(fedoraMountinfo)
+ b.StartTimer()
+ if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil {
+ b.Fatal(err)
+ }
+ }
+}
+
+func TestParseCgroupString(t *testing.T) {
+ testCases := []struct {
+ input string
+ expectedError error
+ expectedOutput map[string]string
+ }{
+ {
+ // Taken from a CoreOS instance running systemd 225 with CPU/Mem
+ // accounting enabled in systemd
+ input: `9:blkio:/
+8:freezer:/
+7:perf_event:/
+6:devices:/system.slice/system-sshd.slice
+5:cpuset:/
+4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+3:net_cls,net_prio:/
+2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`,
+ expectedOutput: map[string]string{
+ "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+ "blkio": "/",
+ "freezer": "/",
+ "perf_event": "/",
+ "devices": "/system.slice/system-sshd.slice",
+ "cpuset": "/",
+ "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+ "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+ "net_cls": "/",
+ "net_prio": "/",
+ "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+ },
+ },
+ {
+ input: `malformed input`,
+ expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`),
+ },
+ }
+
+ for ndx, testCase := range testCases {
+ out, err := parseCgroupFromReader(strings.NewReader(testCase.input))
+ if err != nil {
+ if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() {
+ t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err)
+ }
+ } else {
+ if !reflect.DeepEqual(testCase.expectedOutput, out) {
+ t.Errorf("%v: expected output %v, got error %v", ndx, testCase.expectedOutput, out)
+ }
+ }
+ }
+
+}
+
+func TestIgnoreCgroup2Mount(t *testing.T) {
+ subsystems := map[string]bool{
+ "cpuset": false,
+ "cpu": false,
+ "cpuacct": false,
+ "memory": false,
+ "devices": false,
+ "freezer": false,
+ "net_cls": false,
+ "blkio": false,
+ "perf_event": false,
+ "pids": false,
+ "name=systemd": false,
+ }
+
+ mi := bytes.NewBufferString(cgroup2Mountinfo)
+ cgMounts, err := getCgroupMountsHelper(subsystems, mi, false)
+ if err != nil {
+ t.Fatal(err)
+ }
+ for _, m := range cgMounts {
+ if m.Mountpoint == "/sys/fs/cgroup/systemd" {
+ t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it")
+ }
+ }
+}
+
+func TestGetClosestMountpointAncestor(t *testing.T) {
+ fakeMountInfo := ` 18 24 0:17 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
+100 99 1:31 / /foo/bar rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz2 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/bazza rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz3 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo rw,relatime - fake fake rw,fake
+100 99 1:31 / /unrelated rw,relatime - fake fake rw,fake
+100 99 1:31 / / rw,relatime - fake fake rw,fake
+`
+ testCases := []struct {
+ input string
+ output string
+ }{
+ {input: "/foo/bar/baz/a/b/c", output: "/foo/bar/baz"},
+ {input: "/foo/bar/baz", output: "/foo/bar/baz"},
+ {input: "/foo/bar/bazza", output: "/foo/bar/bazza"},
+ {input: "/a/b/c/d", output: "/"},
+ }
+
+ for _, c := range testCases {
+ mountpoint := GetClosestMountpointAncestor(c.input, fakeMountInfo)
+ if mountpoint != c.output {
+ t.Errorf("expected %s, got %s", c.output, mountpoint)
+ }
+ }
+}
+
+func TestFindCgroupMountpointAndRoot(t *testing.T) {
+ fakeMountInfo := `
+35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+`
+ testCases := []struct {
+ cgroupPath string
+ output string
+ }{
+ {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"},
+ {cgroupPath: "", output: "/foo"},
+ }
+
+ for _, c := range testCases {
+ mountpoint, _, _ := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices")
+ if mountpoint != c.output {
+ t.Errorf("expected %s, got %s", c.output, mountpoint)
+ }
+ }
+}
--- /dev/null
+package configs
+
+import "fmt"
+
+// blockIODevice holds major:minor format supported in blkio cgroup
+type blockIODevice struct {
+ // Major is the device's major number
+ Major int64 `json:"major"`
+ // Minor is the device's minor number
+ Minor int64 `json:"minor"`
+}
+
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair.
type WeightDevice struct {
	blockIODevice
	// Weight is the bandwidth rate for the device, range is from 10 to 1000.
	Weight uint16 `json:"weight"`
	// LeafWeight is the bandwidth rate for the device while competing with the
	// cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only.
	LeafWeight uint16 `json:"leafWeight"`
}
+
+// NewWeightDevice returns a configured WeightDevice pointer
+func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
+ wd := &WeightDevice{}
+ wd.Major = major
+ wd.Minor = minor
+ wd.Weight = weight
+ wd.LeafWeight = leafWeight
+ return wd
+}
+
// WeightString formats the struct as "major:minor weight", the line format
// written to the cgroup-specific weight file.
func (wd *WeightDevice) WeightString() string {
	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
}
+
// LeafWeightString formats the struct as "major:minor leaf_weight", the line
// format written to the cgroup-specific leaf-weight file.
func (wd *WeightDevice) LeafWeightString() string {
	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
}
+
// ThrottleDevice struct holds a `major:minor rate_per_second` pair.
type ThrottleDevice struct {
	blockIODevice
	// Rate is the IO rate limit per cgroup per device.
	Rate uint64 `json:"rate"`
}
+
+// NewThrottleDevice returns a configured ThrottleDevice pointer
+func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
+ td := &ThrottleDevice{}
+ td.Major = major
+ td.Minor = minor
+ td.Rate = rate
+ return td
+}
+
// String formats the struct as "major:minor rate", the line format written
// to the cgroup-specific throttle file.
func (td *ThrottleDevice) String() string {
	return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}
--- /dev/null
+package configs
+
+// FreezerState is the desired state of the freezer cgroup subsystem.
+type FreezerState string
+
+const (
+	// Undefined means no freezer state has been set.
+	Undefined FreezerState = ""
+	// Frozen suspends all tasks in the cgroup.
+	Frozen FreezerState = "FROZEN"
+	// Thawed resumes all tasks in the cgroup.
+	Thawed FreezerState = "THAWED"
+)
+
+// Cgroup describes where a container's cgroups live and what limits apply.
+type Cgroup struct {
+	// Deprecated, use Path instead
+	Name string `json:"name,omitempty"`
+
+	// name of parent of cgroup or slice
+	// Deprecated, use Path instead
+	Parent string `json:"parent,omitempty"`
+
+	// Path specifies the path to cgroups that are created and/or joined by the container.
+	// The path is assumed to be relative to the host system cgroup mountpoint.
+	Path string `json:"path"`
+
+	// ScopePrefix describes prefix for the scope name
+	ScopePrefix string `json:"scope_prefix"`
+
+	// Paths represent the absolute cgroups paths to join.
+	// This takes precedence over Path.
+	Paths map[string]string
+
+	// Resources contains various cgroups settings to apply
+	*Resources
+}
+
+// Resources holds the per-subsystem cgroup resource settings for a container.
+type Resources struct {
+	// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
+	// Deprecated
+	AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
+	// Deprecated
+	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
+	// Deprecated
+	DeniedDevices []*Device `json:"denied_devices,omitempty"`
+
+	// Devices is the device access list applied via the devices cgroup.
+	Devices []*Device `json:"devices"`
+
+	// Memory limit (in bytes)
+	Memory int64 `json:"memory"`
+
+	// Memory reservation or soft_limit (in bytes)
+	MemoryReservation int64 `json:"memory_reservation"`
+
+	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
+	MemorySwap int64 `json:"memory_swap"`
+
+	// Kernel memory limit (in bytes)
+	KernelMemory int64 `json:"kernel_memory"`
+
+	// Kernel memory limit for TCP use (in bytes)
+	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
+
+	// CPU shares (relative weight vs. other containers)
+	CpuShares uint64 `json:"cpu_shares"`
+
+	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
+	CpuQuota int64 `json:"cpu_quota"`
+
+	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
+	CpuPeriod uint64 `json:"cpu_period"`
+
+	// How much CPU time can be used in realtime scheduling (in usecs).
+	CpuRtRuntime int64 `json:"cpu_rt_quota"`
+
+	// CPU period to be used for realtime scheduling (in usecs).
+	CpuRtPeriod uint64 `json:"cpu_rt_period"`
+
+	// CPU to use
+	CpusetCpus string `json:"cpuset_cpus"`
+
+	// MEM to use
+	CpusetMems string `json:"cpuset_mems"`
+
+	// Process limit; set <= `0' to disable limit.
+	PidsLimit int64 `json:"pids_limit"`
+
+	// Specifies per cgroup weight, range is from 10 to 1000.
+	BlkioWeight uint16 `json:"blkio_weight"`
+
+	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
+	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
+
+	// Weight per cgroup per device, can override BlkioWeight.
+	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
+
+	// IO read rate limit per cgroup per device, bytes per second.
+	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
+
+	// IO write rate limit per cgroup per device, bytes per second.
+	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
+
+	// IO read rate limit per cgroup per device, IO per second.
+	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
+
+	// IO write rate limit per cgroup per device, IO per second.
+	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
+
+	// set the freeze value for the process
+	Freezer FreezerState `json:"freezer"`
+
+	// Hugetlb limit (in bytes)
+	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
+
+	// Whether to disable OOM Killer
+	OomKillDisable bool `json:"oom_kill_disable"`
+
+	// Tuning swappiness behaviour per cgroup
+	MemorySwappiness *uint64 `json:"memory_swappiness"`
+
+	// Set priority of network traffic for container
+	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
+
+	// Set class identifier for container's network packets
+	NetClsClassid uint32 `json:"net_cls_classid_u"`
+}
--- /dev/null
+package configs
+
+// TODO Windows: This can ultimately be entirely factored out on Windows as
+// cgroups are a Unix-specific construct.
+// Cgroup is an intentionally empty placeholder so shared code referencing
+// *Cgroup still compiles on Windows.
+type Cgroup struct {
+}
--- /dev/null
+package configs
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "os/exec"
+ "time"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/sirupsen/logrus"
+)
+
+// Rlimit mirrors a setrlimit(2) resource limit applied to the container's
+// init process.
+type Rlimit struct {
+	// Type is the RLIMIT_* resource identifier.
+	Type int `json:"type"`
+	// Hard is the ceiling for the limit.
+	Hard uint64 `json:"hard"`
+	// Soft is the value enforced by the kernel.
+	Soft uint64 `json:"soft"`
+}
+
+// IDMap represents UID/GID Mappings for User Namespaces.
+type IDMap struct {
+	ContainerID int `json:"container_id"`
+	HostID      int `json:"host_id"`
+	Size        int `json:"size"`
+}
+
+// Seccomp represents syscall restrictions
+// By default, only the native architecture of the kernel is allowed to be used
+// for syscalls. Additional architectures can be added by specifying them in
+// Architectures.
+type Seccomp struct {
+	DefaultAction Action     `json:"default_action"`
+	Architectures []string   `json:"architectures"`
+	Syscalls      []*Syscall `json:"syscalls"`
+}
+
+// Action is taken upon rule match in Seccomp
+type Action int
+
+const (
+	Kill Action = iota + 1
+	Errno
+	Trap
+	Allow
+	Trace
+)
+
+// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
+type Operator int
+
+const (
+	EqualTo Operator = iota + 1
+	NotEqualTo
+	GreaterThan
+	GreaterThanOrEqualTo
+	LessThan
+	LessThanOrEqualTo
+	MaskEqualTo
+)
+
+// Arg is a rule to match a specific syscall argument in Seccomp
+type Arg struct {
+	Index    uint     `json:"index"`
+	Value    uint64   `json:"value"`
+	ValueTwo uint64   `json:"value_two"`
+	Op       Operator `json:"op"`
+}
+
+// Syscall is a rule to match a syscall in Seccomp
+type Syscall struct {
+	Name   string `json:"name"`
+	Action Action `json:"action"`
+	Args   []*Arg `json:"args"`
+}
+
+// TODO Windows. Many of these fields should be factored out into those parts
+// which are common across platforms, and those which are platform specific.
+
+// Config defines configuration options for executing a process inside a contained environment.
+type Config struct {
+	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
+	// This is a common option when the container is running in ramdisk
+	NoPivotRoot bool `json:"no_pivot_root"`
+
+	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
+	// that the parent process dies.
+	ParentDeathSignal int `json:"parent_death_signal"`
+
+	// Path to a directory containing the container's root filesystem.
+	Rootfs string `json:"rootfs"`
+
+	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
+	// bind mounts are writable.
+	Readonlyfs bool `json:"readonlyfs"`
+
+	// Specifies the mount propagation flags to be applied to /.
+	RootPropagation int `json:"rootPropagation"`
+
+	// Mounts specify additional source and destination paths that will be mounted inside the container's
+	// rootfs and mount namespace if specified
+	Mounts []*Mount `json:"mounts"`
+
+	// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
+	Devices []*Device `json:"devices"`
+
+	// MountLabel is the SELinux label applied to the container's mounts.
+	MountLabel string `json:"mount_label"`
+
+	// Hostname optionally sets the container's hostname if provided
+	Hostname string `json:"hostname"`
+
+	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
+	// If a namespace is not provided that namespace is shared from the container's parent process
+	Namespaces Namespaces `json:"namespaces"`
+
+	// Capabilities specify the capabilities to keep when executing the process inside the container
+	// All capabilities not specified will be dropped from the processes capability mask
+	Capabilities *Capabilities `json:"capabilities"`
+
+	// Networks specifies the container's network setup to be created
+	Networks []*Network `json:"networks"`
+
+	// Routes can be specified to create entries in the route table as the container is started
+	Routes []*Route `json:"routes"`
+
+	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
+	// placed into to limit the resources the container has available
+	Cgroups *Cgroup `json:"cgroups"`
+
+	// AppArmorProfile specifies the profile to apply to the process running in the container and is
+	// changed at the time the process is execed
+	AppArmorProfile string `json:"apparmor_profile,omitempty"`
+
+	// ProcessLabel specifies the label to apply to the process running in the container. It is
+	// commonly used by selinux
+	ProcessLabel string `json:"process_label,omitempty"`
+
+	// Rlimits specifies the resource limits, such as max open files, to set in the container
+	// If Rlimits are not set, the container will inherit rlimits from the parent process
+	Rlimits []Rlimit `json:"rlimits,omitempty"`
+
+	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
+	// for a process. Valid values are between the range [-1000, 1000], where processes with
+	// higher scores are preferred for being killed. If it is unset then we don't touch the current
+	// value.
+	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
+	OomScoreAdj *int `json:"oom_score_adj,omitempty"`
+
+	// UidMappings is an array of User ID mappings for User Namespaces
+	UidMappings []IDMap `json:"uid_mappings"`
+
+	// GidMappings is an array of Group ID mappings for User Namespaces
+	GidMappings []IDMap `json:"gid_mappings"`
+
+	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
+	// mount pointing to /dev/null as to prevent reads of the file.
+	MaskPaths []string `json:"mask_paths"`
+
+	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
+	// so that these files prevent any writes.
+	ReadonlyPaths []string `json:"readonly_paths"`
+
+	// Sysctl is a map of properties and their values. It is the equivalent of using
+	// sysctl -w my.property.name value in Linux.
+	Sysctl map[string]string `json:"sysctl"`
+
+	// Seccomp allows actions to be taken whenever a syscall is made within the container.
+	// A number of rules are given, each having an action to be taken if a syscall matches it.
+	// A default action to be taken if no rules match is also given.
+	Seccomp *Seccomp `json:"seccomp"`
+
+	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
+	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
+
+	// Hooks are a collection of actions to perform at various container lifecycle events.
+	// CommandHooks are serialized to JSON, but other hooks are not.
+	Hooks *Hooks
+
+	// Version is the version of opencontainer specification that is supported.
+	Version string `json:"version"`
+
+	// Labels are user defined metadata that is stored in the config and populated on the state
+	Labels []string `json:"labels"`
+
+	// NoNewKeyring will not allocate a new session keyring for the container. It will use the
+	// callers keyring in this case.
+	NoNewKeyring bool `json:"no_new_keyring"`
+
+	// IntelRdt specifies settings for Intel RDT group that the container is placed into
+	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
+	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
+
+	// RootlessEUID is set when the runc was launched with non-zero EUID.
+	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
+	// When RootlessEUID is set, runc creates a new userns for the container.
+	// (config.json needs to contain userns settings)
+	RootlessEUID bool `json:"rootless_euid,omitempty"`
+
+	// RootlessCgroups is set when unlikely to have the full access to cgroups.
+	// When RootlessCgroups is set, cgroups errors are ignored.
+	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
+}
+
+// Hooks groups the lifecycle hooks by the stage at which they run.
+type Hooks struct {
+	// Prestart commands are executed after the container namespaces are created,
+	// but before the user supplied command is executed from init.
+	Prestart []Hook
+
+	// Poststart commands are executed after the container init process starts.
+	Poststart []Hook
+
+	// Poststop commands are executed after the container init process exits.
+	Poststop []Hook
+}
+
+// Capabilities lists the capability sets applied to the container process.
+type Capabilities struct {
+	// Bounding is the set of capabilities checked by the kernel.
+	Bounding []string
+	// Effective is the set of capabilities checked by the kernel.
+	Effective []string
+	// Inheritable is the capabilities preserved across execve.
+	Inheritable []string
+	// Permitted is the limiting superset for effective capabilities.
+	Permitted []string
+	// Ambient is the ambient set of capabilities that are kept.
+	Ambient []string
+}
+
+// UnmarshalJSON restores hooks from their serialized CommandHook form; only
+// CommandHooks survive a marshal/unmarshal round trip (see MarshalJSON).
+func (hooks *Hooks) UnmarshalJSON(b []byte) error {
+	var state struct {
+		Prestart  []CommandHook
+		Poststart []CommandHook
+		Poststop  []CommandHook
+	}
+	if err := json.Unmarshal(b, &state); err != nil {
+		return err
+	}
+
+	// Widen each concrete CommandHook back to the Hook interface.
+	toHooks := func(cmds []CommandHook) []Hook {
+		var out []Hook
+		for _, c := range cmds {
+			out = append(out, c)
+		}
+		return out
+	}
+
+	hooks.Prestart = toHooks(state.Prestart)
+	hooks.Poststart = toHooks(state.Poststart)
+	hooks.Poststop = toHooks(state.Poststop)
+	return nil
+}
+
+// MarshalJSON serializes only CommandHook entries; any other Hook
+// implementation (e.g. FuncHook) cannot be represented as JSON and is
+// dropped with a warning.
+func (hooks Hooks) MarshalJSON() ([]byte, error) {
+	serialize := func(in []Hook) []CommandHook {
+		var out []CommandHook
+		for _, h := range in {
+			if chook, ok := h.(CommandHook); ok {
+				out = append(out, chook)
+			} else {
+				logrus.Warnf("cannot serialize hook of type %T, skipping", h)
+			}
+		}
+		return out
+	}
+	return json.Marshal(map[string]interface{}{
+		"prestart":  serialize(hooks.Prestart),
+		"poststart": serialize(hooks.Poststart),
+		"poststop":  serialize(hooks.Poststop),
+	})
+}
+
+type Hook interface {
+	// Run executes the hook with the provided state.
+	Run(*specs.State) error
+}
+
+// NewFunctionHook will call the provided function when the hook is run.
+func NewFunctionHook(f func(*specs.State) error) FuncHook {
+	return FuncHook{
+		run: f,
+	}
+}
+
+// FuncHook is a Hook backed by an in-process function; it cannot be
+// serialized to JSON (see Hooks.MarshalJSON).
+type FuncHook struct {
+	run func(*specs.State) error
+}
+
+// Run invokes the wrapped function with the container state.
+func (f FuncHook) Run(s *specs.State) error {
+	return f.run(s)
+}
+
+// Command describes an external program to execute as a hook, together with
+// an optional per-invocation timeout.
+type Command struct {
+	Path    string         `json:"path"`
+	Args    []string       `json:"args"`
+	Env     []string       `json:"env"`
+	Dir     string         `json:"dir"`
+	Timeout *time.Duration `json:"timeout"`
+}
+
+// NewCommandHook will execute the provided command when the hook is run.
+func NewCommandHook(cmd Command) CommandHook {
+	return CommandHook{
+		Command: cmd,
+	}
+}
+
+// CommandHook is a Hook that runs an external Command; unlike FuncHook it
+// survives JSON serialization.
+type CommandHook struct {
+	Command
+}
+
+// Run implements the Hook interface: the container state is serialized to
+// JSON and fed to the command on stdin, and the command's exit is awaited,
+// enforcing c.Timeout when one is set.
+func (c Command) Run(s *specs.State) error {
+	b, err := json.Marshal(s)
+	if err != nil {
+		return err
+	}
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Cmd{
+		Path:   c.Path,
+		Args:   c.Args,
+		Env:    c.Env,
+		Stdin:  bytes.NewReader(b),
+		Stdout: &stdout,
+		Stderr: &stderr,
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+	// Wait in a goroutine so the select below can race completion against
+	// the optional timeout. The channel is buffered so the goroutine never
+	// leaks if the timeout branch wins.
+	errC := make(chan error, 1)
+	go func() {
+		err := cmd.Wait()
+		if err != nil {
+			err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
+		}
+		errC <- err
+	}()
+	var timerCh <-chan time.Time
+	if c.Timeout != nil {
+		timer := time.NewTimer(*c.Timeout)
+		defer timer.Stop()
+		timerCh = timer.C
+	}
+	// When no timeout is configured timerCh stays nil, so that select case
+	// blocks forever and only command completion can fire.
+	select {
+	case err := <-errC:
+		return err
+	case <-timerCh:
+		cmd.Process.Kill()
+		cmd.Wait()
+		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
+	}
+}
--- /dev/null
+package configs
+
+import "fmt"
+
+// HostUID gets the translated uid for the process on host which could be
+// different when user namespaces are enabled.
+// When NEWUSER is not configured the id is returned unchanged.
+func (c Config) HostUID(containerId int) (int, error) {
+	if c.Namespaces.Contains(NEWUSER) {
+		if c.UidMappings == nil {
+			// Error strings follow Go convention: lowercase, no period.
+			return -1, fmt.Errorf("user namespaces enabled, but no uid mappings found")
+		}
+		id, found := c.hostIDFromMapping(containerId, c.UidMappings)
+		if !found {
+			return -1, fmt.Errorf("user namespaces enabled, but no user mapping found")
+		}
+		return id, nil
+	}
+	// Return unchanged id.
+	return containerId, nil
+}
+
+// HostRootUID gets the root uid for the process on host which could be non-zero
+// when user namespaces are enabled. It is simply HostUID applied to
+// container uid 0.
+func (c Config) HostRootUID() (int, error) {
+	return c.HostUID(0)
+}
+
+// HostGID gets the translated gid for the process on host which could be
+// different when user namespaces are enabled.
+// When NEWUSER is not configured the id is returned unchanged.
+func (c Config) HostGID(containerId int) (int, error) {
+	if c.Namespaces.Contains(NEWUSER) {
+		if c.GidMappings == nil {
+			// Error strings follow Go convention: lowercase, no period.
+			return -1, fmt.Errorf("user namespaces enabled, but no gid mappings found")
+		}
+		id, found := c.hostIDFromMapping(containerId, c.GidMappings)
+		if !found {
+			return -1, fmt.Errorf("user namespaces enabled, but no group mapping found")
+		}
+		return id, nil
+	}
+	// Return unchanged id.
+	return containerId, nil
+}
+
+// HostRootGID gets the root gid for the process on host which could be non-zero
+// when user namespaces are enabled. It is simply HostGID applied to
+// container gid 0.
+func (c Config) HostRootGID() (int, error) {
+	return c.HostGID(0)
+}
+
+// hostIDFromMapping translates a container ID to a host ID using the given
+// user namespace mapping, reporting whether any mapping range covered the ID.
+func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
+	for _, m := range uMap {
+		// Skip ranges that do not contain containerID.
+		if containerID < m.ContainerID || containerID > m.ContainerID+m.Size-1 {
+			continue
+		}
+		return m.HostID + (containerID - m.ContainerID), true
+	}
+	return -1, false
+}
--- /dev/null
+package configs
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+// loadConfig decodes the named sample config and verifies that the JSON
+// contains no keys unknown to the Config struct.
+func loadConfig(name string) (*Config, error) {
+	f, err := os.Open(filepath.Join("../sample_configs", name))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var container *Config
+	if err := json.NewDecoder(f).Decode(&container); err != nil {
+		return nil, err
+	}
+
+	// Re-read the raw JSON and compare its key set against the keys produced
+	// by re-marshaling the decoded Config; leftovers are unknown fields.
+	if _, err := f.Seek(0, 0); err != nil {
+		return nil, err
+	}
+	var rawMap map[string]interface{}
+	if err := json.NewDecoder(f).Decode(&rawMap); err != nil {
+		return nil, err
+	}
+	encoded, err := json.Marshal(&container)
+	if err != nil {
+		return nil, err
+	}
+	var knownMap map[string]interface{}
+	if err := json.Unmarshal(encoded, &knownMap); err != nil {
+		return nil, err
+	}
+	for key := range knownMap {
+		delete(rawMap, key)
+	}
+	if len(rawMap) != 0 {
+		return nil, fmt.Errorf("unknown fields: %s", rawMap)
+	}
+
+	return container, nil
+}
+
+// TestRemoveNamespace verifies that Remove reports success and empties the set.
+func TestRemoveNamespace(t *testing.T) {
+	nsList := Namespaces{{Type: NEWNET}}
+	if !nsList.Remove(NEWNET) {
+		t.Fatal("NEWNET was not removed")
+	}
+	if got := len(nsList); got != 0 {
+		t.Fatalf("namespaces should have 0 items but reports %d", got)
+	}
+}
+
+// TestHostRootUIDNoUSERNS verifies uid passthrough when no userns is set.
+func TestHostRootUIDNoUSERNS(t *testing.T) {
+	cfg := &Config{Namespaces: Namespaces{}}
+	uid, err := cfg.HostRootUID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if uid != 0 {
+		t.Fatalf("expected uid 0 with no USERNS but received %d", uid)
+	}
+}
+
+// TestHostRootUIDWithUSERNS verifies uid translation through a uid mapping.
+func TestHostRootUIDWithUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{{Type: NEWUSER}},
+		UidMappings: []IDMap{
+			{
+				ContainerID: 0,
+				HostID:      1000,
+				Size:        1,
+			},
+		},
+	}
+	uid, err := config.HostRootUID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if uid != 1000 {
+		// Fixed message: this case runs *with* a user namespace configured.
+		t.Fatalf("expected uid 1000 with USERNS but received %d", uid)
+	}
+}
+
+// TestHostRootGIDNoUSERNS verifies gid passthrough when no userns is set.
+func TestHostRootGIDNoUSERNS(t *testing.T) {
+	cfg := &Config{Namespaces: Namespaces{}}
+	gid, err := cfg.HostRootGID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if gid != 0 {
+		t.Fatalf("expected gid 0 with no USERNS but received %d", gid)
+	}
+}
+
+// TestHostRootGIDWithUSERNS verifies gid translation through a gid mapping.
+func TestHostRootGIDWithUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{{Type: NEWUSER}},
+		GidMappings: []IDMap{
+			{
+				ContainerID: 0,
+				HostID:      1000,
+				Size:        1,
+			},
+		},
+	}
+	gid, err := config.HostRootGID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if gid != 1000 {
+		// Fixed message: this case runs *with* a user namespace configured.
+		t.Fatalf("expected gid 1000 with USERNS but received %d", gid)
+	}
+}
--- /dev/null
+package configs_test
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "reflect"
+ "testing"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// TestUnmarshalHooks checks that a serialized CommandHook round-trips
+// through Hooks.UnmarshalJSON.
+func TestUnmarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+	encoded, err := json.Marshal(cmdHook.Command)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	hook := configs.Hooks{}
+	if err := hook.UnmarshalJSON([]byte(fmt.Sprintf(`{"Prestart" :[%s]}`, encoded))); err != nil {
+		t.Fatal(err)
+	}
+
+	if !reflect.DeepEqual(hook.Prestart[0], cmdHook) {
+		t.Errorf("Expected prestart to equal %+v but it was %+v",
+			cmdHook, hook.Prestart[0])
+	}
+}
+
+// TestUnmarshalHooksWithInvalidData checks that malformed JSON is rejected.
+func TestUnmarshalHooksWithInvalidData(t *testing.T) {
+	hook := configs.Hooks{}
+	if err := hook.UnmarshalJSON([]byte(`{invalid-json}`)); err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestMarshalHooks checks the exact JSON produced for a single prestart
+// CommandHook.
+func TestMarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+
+	hook := configs.Hooks{
+		Prestart: []configs.Hook{cmdHook},
+	}
+	serialized, err := hook.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	h := `{"poststart":null,"poststop":null,"prestart":[{"path":"/var/vcap/hooks/prestart","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]}`
+	if string(serialized) != h {
+		t.Errorf("Expected hooks %s to equal %s", string(serialized), h)
+	}
+}
+
+// TestMarshalUnmarshalHooks checks that a CommandHook survives a full
+// marshal -> unmarshal round trip unchanged.
+func TestMarshalUnmarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	prestart := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+
+	original := configs.Hooks{
+		Prestart: []configs.Hook{prestart},
+	}
+	serialized, err := original.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	restored := configs.Hooks{}
+	if err := restored.UnmarshalJSON(serialized); err != nil {
+		t.Fatal(err)
+	}
+	if !reflect.DeepEqual(restored.Prestart[0], prestart) {
+		t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", restored.Prestart[0], prestart)
+	}
+}
+
+// TestMarshalHooksWithUnexpectedType checks that non-serializable hooks
+// (FuncHook) are silently dropped from the JSON output.
+func TestMarshalHooksWithUnexpectedType(t *testing.T) {
+	fHook := configs.NewFunctionHook(func(*specs.State) error {
+		return nil
+	})
+	hook := configs.Hooks{
+		Prestart: []configs.Hook{fHook},
+	}
+	serialized, err := hook.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	h := `{"poststart":null,"poststop":null,"prestart":null}`
+	if string(serialized) != h {
+		t.Errorf("Expected hooks %s to equal %s", string(serialized), h)
+	}
+}
+
+// TestFuncHookRun checks that a FuncHook receives the exact state passed in.
+func TestFuncHookRun(t *testing.T) {
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+
+	hook := configs.NewFunctionHook(func(s *specs.State) error {
+		if !reflect.DeepEqual(state, s) {
+			t.Errorf("Expected state %+v to equal %+v", state, s)
+		}
+		return nil
+	})
+	hook.Run(state)
+}
+
+// TestCommandHookRun executes the test binary itself as the hook command and
+// expects it to exit cleanly within the timeout.
+func TestCommandHookRun(t *testing.T) {
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+	timeout := time.Second
+
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    os.Args[0],
+		Args:    []string{os.Args[0], "-test.run=TestHelperProcess"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/",
+		Timeout: &timeout,
+	})
+
+	// Use Errorf's format/args form directly instead of t.Errorf(fmt.Sprintf(...)),
+	// which go vet flags as a printf-wrapper misuse.
+	err := cmdHook.Run(state)
+	if err != nil {
+		t.Errorf("Expected error to not occur but it was %+v", err)
+	}
+}
+
+// TestCommandHookRunTimeout runs a deliberately slow helper and expects the
+// hook to fail with a timeout error.
+func TestCommandHookRunTimeout(t *testing.T) {
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+	timeout := 10 * time.Millisecond
+
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    os.Args[0],
+		Args:    []string{os.Args[0], "-test.run=TestHelperProcessWithTimeout"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/",
+		Timeout: &timeout,
+	})
+
+	if err := cmdHook.Run(state); err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestHelperProcess is not a real test: it is re-executed as a child process
+// by TestCommandHookRun and exits immediately with success.
+func TestHelperProcess(*testing.T) {
+	fmt.Println("Helper Process")
+	os.Exit(0)
+}
+
+// TestHelperProcessWithTimeout is re-executed by TestCommandHookRunTimeout;
+// it sleeps long enough to exceed the hook's 10ms timeout.
+func TestHelperProcessWithTimeout(*testing.T) {
+	time.Sleep(time.Second)
+}
--- /dev/null
+package configs
+
+// All current tests are for Unix-specific functionality
--- /dev/null
+package configs
+
+import (
+ "fmt"
+ "os"
+)
+
+const (
+	// Wildcard matches any major or minor device number in cgroup rules
+	// (rendered as "*" by deviceNumberString).
+	Wildcard = -1
+)
+
+// TODO Windows: This can be factored out in the future
+
+// Device describes a device node and its cgroup access rule.
+type Device struct {
+	// Device type, block, char, etc.
+	Type rune `json:"type"`
+
+	// Path to the device.
+	Path string `json:"path"`
+
+	// Major is the device's major number.
+	Major int64 `json:"major"`
+
+	// Minor is the device's minor number.
+	Minor int64 `json:"minor"`
+
+	// Cgroup permissions format, rwm.
+	Permissions string `json:"permissions"`
+
+	// FileMode permission bits for the device.
+	FileMode os.FileMode `json:"file_mode"`
+
+	// Uid of the device.
+	Uid uint32 `json:"uid"`
+
+	// Gid of the device.
+	Gid uint32 `json:"gid"`
+
+	// Write the file to the allowed list
+	Allow bool `json:"allow"`
+}
+
+// CgroupString renders the device rule in the "type major:minor perms" form
+// written to the devices cgroup.
+func (d *Device) CgroupString() string {
+	major := deviceNumberString(d.Major)
+	minor := deviceNumberString(d.Minor)
+	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
+}
+
+// Mkdev composes the device number from Major and Minor using the same bit
+// layout as the original expression (low 8 minor bits, then major, then the
+// high minor bits shifted up).
+func (d *Device) Mkdev() int {
+	low := d.Minor & 0xff
+	high := (d.Minor & 0xfff00) << 12
+	return int((d.Major << 8) | low | high)
+}
+
+// deviceNumberString converts the device number to a string return result.
+// Wildcard is rendered as "*" for cgroup rule syntax.
+func deviceNumberString(number int64) string {
+	if number != Wildcard {
+		return fmt.Sprint(number)
+	}
+	return "*"
+}
--- /dev/null
+// +build linux
+
+package configs
+
+var (
+	// DefaultSimpleDevices are devices that are to be both allowed and created.
+	DefaultSimpleDevices = []*Device{
+		// /dev/null and zero
+		{
+			Path:        "/dev/null",
+			Type:        'c',
+			Major:       1,
+			Minor:       3,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+		{
+			Path:        "/dev/zero",
+			Type:        'c',
+			Major:       1,
+			Minor:       5,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		{
+			Path:        "/dev/full",
+			Type:        'c',
+			Major:       1,
+			Minor:       7,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		// consoles and ttys
+		{
+			Path:        "/dev/tty",
+			Type:        'c',
+			Major:       5,
+			Minor:       0,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		// /dev/urandom,/dev/random
+		{
+			Path:        "/dev/urandom",
+			Type:        'c',
+			Major:       1,
+			Minor:       9,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+		{
+			Path:        "/dev/random",
+			Type:        'c',
+			Major:       1,
+			Minor:       8,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+	}
+	// DefaultAllowedDevices are cgroup allow-rules (no Path means the node is
+	// not created, only permitted); it is a superset of DefaultSimpleDevices.
+	DefaultAllowedDevices = append([]*Device{
+		// allow mknod for any device
+		{
+			Type:        'c',
+			Major:       Wildcard,
+			Minor:       Wildcard,
+			Permissions: "m",
+		},
+		{
+			Type:        'b',
+			Major:       Wildcard,
+			Minor:       Wildcard,
+			Permissions: "m",
+		},
+
+		{
+			Path:        "/dev/console",
+			Type:        'c',
+			Major:       5,
+			Minor:       1,
+			Permissions: "rwm",
+		},
+		// /dev/pts/ - pts namespaces are "coming soon"
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       136,
+			Minor:       Wildcard,
+			Permissions: "rwm",
+		},
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       5,
+			Minor:       2,
+			Permissions: "rwm",
+		},
+
+		// tuntap
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       10,
+			Minor:       200,
+			Permissions: "rwm",
+		},
+	}, DefaultSimpleDevices...)
+	// DefaultAutoCreatedDevices are the nodes created in the container's /dev.
+	DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
+)
--- /dev/null
+package configs
+
+// HugepageLimit corresponds to an entry in the hugetlb cgroup.
+type HugepageLimit struct {
+	// which type of hugepage to limit.
+	Pagesize string `json:"page_size"`
+
+	// usage limit for hugepage.
+	Limit uint64 `json:"limit"`
+}
--- /dev/null
+package configs
+
+// IntelRdt holds the Intel RDT (resctrl) schemata applied to a container.
+type IntelRdt struct {
+	// The schema for L3 cache id and capacity bitmask (CBM)
+	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+	L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+
+	// The schema of memory bandwidth percentage per L3 cache id
+	// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+	// NOTE(review): the camelCase tag differs from the package's snake_case
+	// convention; changing it would break serialized configs, so it stays.
+	MemBwSchema string `json:"memBwSchema,omitempty"`
+}
--- /dev/null
+package configs
+
+import (
+ "fmt"
+)
+
+// IfPrioMap maps a network interface name to a net_prio priority.
+type IfPrioMap struct {
+	Interface string `json:"interface"`
+	Priority  int64  `json:"priority"`
+}
+
+// CgroupString renders the pair in the "iface priority" form consumed by the
+// net_prio cgroup.
+func (i *IfPrioMap) CgroupString() string {
+	line := fmt.Sprintf("%s %d", i.Interface, i.Priority)
+	return line
+}
--- /dev/null
+package configs
+
+const (
+	// EXT_COPYUP is a directive to copy up the contents of a directory when
+	// a tmpfs is mounted over it.
+	EXT_COPYUP = 1 << iota
+)
+
+// Mount describes a single mount to perform inside the container.
+type Mount struct {
+	// Source path for the mount.
+	Source string `json:"source"`
+
+	// Destination path for the mount inside the container.
+	Destination string `json:"destination"`
+
+	// Device the mount is for.
+	Device string `json:"device"`
+
+	// Mount flags.
+	Flags int `json:"flags"`
+
+	// Propagation Flags
+	PropagationFlags []int `json:"propagation_flags"`
+
+	// Mount data applied to the mount.
+	Data string `json:"data"`
+
+	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
+	Relabel string `json:"relabel"`
+
+	// Extensions are additional flags that are specific to runc (e.g. EXT_COPYUP).
+	Extensions int `json:"extensions"`
+
+	// Optional Command to be run before Source is mounted.
+	PremountCmds []Command `json:"premount_cmds"`
+
+	// Optional Command to be run after Source is mounted.
+	PostmountCmds []Command `json:"postmount_cmds"`
+}
--- /dev/null
+package configs
+
+// NamespaceType names a kind of Linux namespace (see the NEW* constants).
+type NamespaceType string
+
+// Namespaces is the ordered collection of namespaces for a container.
+type Namespaces []Namespace
--- /dev/null
+package configs
+
+import (
+ "fmt"
+ "os"
+ "sync"
+)
+
+const (
+	NEWNET    NamespaceType = "NEWNET"
+	NEWPID    NamespaceType = "NEWPID"
+	NEWNS     NamespaceType = "NEWNS"
+	NEWUTS    NamespaceType = "NEWUTS"
+	NEWIPC    NamespaceType = "NEWIPC"
+	NEWUSER   NamespaceType = "NEWUSER"
+	NEWCGROUP NamespaceType = "NEWCGROUP"
+)
+
+var (
+	// nsLock guards supportedNamespaces.
+	nsLock sync.Mutex
+	// supportedNamespaces caches the results of IsNamespaceSupported probes.
+	supportedNamespaces = make(map[NamespaceType]bool)
+)
+
+// NsName converts the namespace type to its filename under /proc/<pid>/ns;
+// unknown types map to "".
+func NsName(ns NamespaceType) string {
+	switch ns {
+	case NEWCGROUP:
+		return "cgroup"
+	case NEWIPC:
+		return "ipc"
+	case NEWNET:
+		return "net"
+	case NEWNS:
+		return "mnt"
+	case NEWPID:
+		return "pid"
+	case NEWUSER:
+		return "user"
+	case NEWUTS:
+		return "uts"
+	default:
+		return ""
+	}
+}
+
+// IsNamespaceSupported returns whether a namespace is available or
+// not. Results are cached in supportedNamespaces under nsLock.
+func IsNamespaceSupported(ns NamespaceType) bool {
+	nsLock.Lock()
+	defer nsLock.Unlock()
+	if cached, ok := supportedNamespaces[ns]; ok {
+		return cached
+	}
+	name := NsName(ns)
+	// if the namespace type is unknown, just return false
+	if name == "" {
+		return false
+	}
+	// a namespace is supported if it exists and we have permissions to read it
+	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", name))
+	supported := err == nil
+	supportedNamespaces[ns] = supported
+	return supported
+}
+
+// NamespaceTypes returns all namespace types in the order they must be
+// entered/created.
+func NamespaceTypes() []NamespaceType {
+	return []NamespaceType{
+		NEWUSER, // Keep user NS always first, don't move it.
+		NEWIPC,
+		NEWUTS,
+		NEWNET,
+		NEWPID,
+		NEWNS,
+		NEWCGROUP,
+	}
+}
+
+// Namespace defines configuration for each namespace. It specifies an
+// alternate path that is able to be joined via setns.
+type Namespace struct {
+	Type NamespaceType `json:"type"`
+	// Path, when non-empty, is an existing namespace to join instead of
+	// creating a new one (see CloneFlags).
+	Path string `json:"path"`
+}
+
+// GetPath returns the /proc path of this namespace for the given pid.
+func (n *Namespace) GetPath(pid int) string {
+	name := NsName(n.Type)
+	return fmt.Sprintf("/proc/%d/ns/%s", pid, name)
+}
+
+// Remove deletes the namespace of type t, reporting whether it was present.
+func (n *Namespaces) Remove(t NamespaceType) bool {
+	idx := n.index(t)
+	if idx < 0 {
+		return false
+	}
+	*n = append((*n)[:idx], (*n)[idx+1:]...)
+	return true
+}
+
+// Add sets the path for namespace t, appending a new entry if t is absent.
+func (n *Namespaces) Add(t NamespaceType, path string) {
+	if idx := n.index(t); idx != -1 {
+		(*n)[idx].Path = path
+		return
+	}
+	*n = append(*n, Namespace{Type: t, Path: path})
+}
+
+// index returns the position of the first namespace of type t, or -1
+// if the set contains no such entry.
+func (n *Namespaces) index(t NamespaceType) int {
+ for i, ns := range *n {
+  if ns.Type == t {
+   return i
+  }
+ }
+ return -1
+}
+
+// Contains reports whether the set has a namespace of type t.
+func (n *Namespaces) Contains(t NamespaceType) bool {
+ return n.index(t) != -1
+}
+
+// PathOf returns the configured path of the namespace of type t, or
+// the empty string if the type is absent (or has no path set).
+func (n *Namespaces) PathOf(t NamespaceType) string {
+ i := n.index(t)
+ if i == -1 {
+  return ""
+ }
+ return (*n)[i].Path
+}
--- /dev/null
+// +build linux
+
+package configs
+
+import "golang.org/x/sys/unix"
+
+// Syscall returns the CLONE_* flag corresponding to this namespace's
+// type (zero for an unknown type, since the map lookup misses).
+func (n *Namespace) Syscall() int {
+ return namespaceInfo[n.Type]
+}
+
+// namespaceInfo maps each namespace type to its clone(2)/unshare(2)
+// CLONE_* flag.
+var namespaceInfo = map[NamespaceType]int{
+ NEWNET: unix.CLONE_NEWNET,
+ NEWNS: unix.CLONE_NEWNS,
+ NEWUSER: unix.CLONE_NEWUSER,
+ NEWIPC: unix.CLONE_NEWIPC,
+ NEWUTS: unix.CLONE_NEWUTS,
+ NEWPID: unix.CLONE_NEWPID,
+ NEWCGROUP: unix.CLONE_NEWCGROUP,
+}
+
+// CloneFlags parses the container's Namespaces options to set the correct
+// flags on clone, unshare. This function returns flags only for new namespaces.
+func (n *Namespaces) CloneFlags() uintptr {
+ var flag int
+ for _, v := range *n {
+  // Entries with a Path are joined via setns, not created, so
+  // they contribute no clone flag.
+  if v.Path != "" {
+   continue
+  }
+  flag |= namespaceInfo[v.Type]
+ }
+ return uintptr(flag)
+}
--- /dev/null
+// +build !linux,!windows
+
+package configs
+
+// Syscall is unimplemented on non-linux, non-windows platforms and
+// always panics.
+func (n *Namespace) Syscall() int {
+ panic("No namespace syscall support")
+}
+
+// CloneFlags parses the container's Namespaces options to set the correct
+// flags on clone, unshare. This function returns flags only for new namespaces.
+// It is unimplemented on non-linux, non-windows platforms and always panics.
+func (n *Namespaces) CloneFlags() uintptr {
+ panic("No namespace syscall support")
+}
--- /dev/null
+// +build !linux
+
+package configs
+
+// Namespace defines configuration for each namespace. It specifies an
+// alternate path that is able to be joined via setns.
+// On non-linux platforms the type carries no fields.
+type Namespace struct {
+}
--- /dev/null
+package configs
+
+// Network defines configuration for a container's networking stack
+//
+// The network configuration can be omitted from a container causing the
+// container to be setup with the host's networking stack
+type Network struct {
+ // Type sets the networks type, commonly veth and loopback
+ Type string `json:"type"`
+
+ // Name of the network interface
+ Name string `json:"name"`
+
+ // The bridge to use.
+ Bridge string `json:"bridge"`
+
+ // MacAddress contains the MAC address to set on the network interface
+ MacAddress string `json:"mac_address"`
+
+ // Address contains the IPv4 and mask to set on the network interface
+ Address string `json:"address"`
+
+ // Gateway sets the gateway address that is used as the default for the interface
+ Gateway string `json:"gateway"`
+
+ // IPv6Address contains the IPv6 and mask to set on the network interface
+ IPv6Address string `json:"ipv6_address"`
+
+ // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
+ IPv6Gateway string `json:"ipv6_gateway"`
+
+ // Mtu sets the mtu value for the interface and will be mirrored on both the host and
+ // container's interfaces if a pair is created, specifically in the case of type veth
+ // Note: This does not apply to loopback interfaces.
+ Mtu int `json:"mtu"`
+
+ // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
+ // container's interfaces if a pair is created, specifically in the case of type veth
+ // Note: This does not apply to loopback interfaces.
+ TxQueueLen int `json:"txqueuelen"`
+
+ // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the
+ // container.
+ HostInterfaceName string `json:"host_interface_name"`
+
+ // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
+ // bridge port in the case of type veth
+ // Note: This is unsupported on some systems.
+ // Note: This does not apply to loopback interfaces.
+ HairpinMode bool `json:"hairpin_mode"`
+}
+
+// Routes can be specified to create entries in the route table as the container is started
+//
+// All of destination, source, and gateway should be either IPv4 or IPv6.
+// One of the three options must be present, and omitted entries will use their
+// IP family default for the route table. For IPv4 for example, setting the
+// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
+// destination of 0.0.0.0(or *) when viewed in the route table.
+type Route struct {
+ // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6
+ Destination string `json:"destination"`
+
+ // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6
+ Source string `json:"source"`
+
+ // Sets the gateway. Accepts IPv4 and IPv6
+ Gateway string `json:"gateway"`
+
+ // The device to set this route up for, for example: eth0
+ InterfaceName string `json:"interface_name"`
+}
--- /dev/null
+package validate
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// rootlessEUID makes sure that the config can be applied when runc
+// is being executed as a non-root user (euid != 0) in the current user namespace.
+// It delegates to rootlessEUIDMappings and rootlessEUIDMount, returning
+// the first failure.
+func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
+ if err := rootlessEUIDMappings(config); err != nil {
+  return err
+ }
+ if err := rootlessEUIDMount(config); err != nil {
+  return err
+ }
+
+ // XXX: We currently can't verify the user config at all, because
+ // configs.Config doesn't store the user-related configs. So this
+ // has to be verified by setupUser() in init_linux.go.
+
+ return nil
+}
+
+// hasIDMapping reports whether the container-side id falls inside any
+// of the given mappings, i.e. [ContainerID, ContainerID+Size).
+func hasIDMapping(id int, mappings []configs.IDMap) bool {
+ for _, m := range mappings {
+  if id >= m.ContainerID && id < m.ContainerID+m.Size {
+   return true
+  }
+ }
+ return false
+}
+
+// rootlessEUIDMappings checks that a rootless config requests a user
+// namespace and carries at least one UID and one GID mapping.
+func rootlessEUIDMappings(config *configs.Config) error {
+ if !config.Namespaces.Contains(configs.NEWUSER) {
+  return fmt.Errorf("rootless container requires user namespaces")
+ }
+
+ if len(config.UidMappings) == 0 {
+  return fmt.Errorf("rootless containers requires at least one UID mapping")
+ }
+ if len(config.GidMappings) == 0 {
+  return fmt.Errorf("rootless containers requires at least one GID mapping")
+ }
+ return nil
+}
+
+// mount verifies that the user isn't trying to set up any mounts they don't have
+// the rights to do. In addition, it makes sure that no mount has a `uid=` or
+// `gid=` option that doesn't resolve to root.
+func rootlessEUIDMount(config *configs.Config) error {
+ // XXX: We could whitelist allowed devices at this point, but I'm not
+ // convinced that's a good idea. The kernel is the best arbiter of
+ // access control.
+
+ for _, mount := range config.Mounts {
+  // Check that the options list doesn't contain any uid= or gid= entries
+  // that don't resolve to root.
+  for _, opt := range strings.Split(mount.Data, ",") {
+   if strings.HasPrefix(opt, "uid=") {
+    var uid int
+    n, err := fmt.Sscanf(opt, "uid=%d", &uid)
+    if n != 1 || err != nil {
+     // Ignore unknown mount options (a uid= value
+     // that doesn't parse as an int is skipped).
+     continue
+    }
+    // Reject uids that no UID mapping covers.
+    if !hasIDMapping(uid, config.UidMappings) {
+     return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers")
+    }
+   }
+
+   if strings.HasPrefix(opt, "gid=") {
+    var gid int
+    n, err := fmt.Sscanf(opt, "gid=%d", &gid)
+    if n != 1 || err != nil {
+     // Ignore unknown mount options (same as uid= above).
+     continue
+    }
+    // Reject gids that no GID mapping covers.
+    if !hasIDMapping(gid, config.GidMappings) {
+     return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers")
+    }
+   }
+  }
+ }
+
+ return nil
+}
--- /dev/null
+package validate
+
+import (
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// rootlessEUIDConfig returns a minimal config that passes rootless
+// validation: user namespace enabled plus one UID and one GID mapping
+// covering container id 0.
+func rootlessEUIDConfig() *configs.Config {
+ return &configs.Config{
+  Rootfs: "/var",
+  RootlessEUID: true,
+  RootlessCgroups: true,
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {Type: configs.NEWUSER},
+   },
+  ),
+  UidMappings: []configs.IDMap{
+   {
+    HostID: 1337,
+    ContainerID: 0,
+    Size: 1,
+   },
+  },
+  GidMappings: []configs.IDMap{
+   {
+    HostID: 7331,
+    ContainerID: 0,
+    Size: 1,
+   },
+  },
+ }
+}
+
+// TestValidateRootlessEUID checks that the baseline rootless config
+// passes validation unchanged.
+func TestValidateRootlessEUID(t *testing.T) {
+ validator := New()
+
+ config := rootlessEUIDConfig()
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+/* rootlessEUIDMappings */
+
+// TestValidateRootlessEUIDUserns checks that dropping all namespaces
+// (and thus NEWUSER) makes rootless validation fail.
+func TestValidateRootlessEUIDUserns(t *testing.T) {
+ validator := New()
+
+ config := rootlessEUIDConfig()
+ config.Namespaces = nil
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur if user namespaces not set")
+ }
+}
+
+// TestValidateRootlessEUIDMappingUid checks that an absent UID mapping
+// fails rootless validation.
+func TestValidateRootlessEUIDMappingUid(t *testing.T) {
+ validator := New()
+
+ config := rootlessEUIDConfig()
+ config.UidMappings = nil
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur if no uid mappings provided")
+ }
+}
+
+// TestValidateNonZeroEUIDMappingGid checks that an absent GID mapping
+// fails rootless validation.
+func TestValidateNonZeroEUIDMappingGid(t *testing.T) {
+ validator := New()
+
+ config := rootlessEUIDConfig()
+ config.GidMappings = nil
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur if no gid mappings provided")
+ }
+}
+
+/* rootlessEUIDMount() */
+
+// TestValidateRootlessEUIDMountUid exercises rootlessEUIDMount's uid=
+// handling: mapped uids are accepted, unmapped uids are rejected.
+func TestValidateRootlessEUIDMountUid(t *testing.T) {
+ config := rootlessEUIDConfig()
+ validator := New()
+
+ config.Mounts = []*configs.Mount{
+  {
+   Source: "devpts",
+   Destination: "/dev/pts",
+   Device: "devpts",
+  },
+ }
+
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "uid=5"
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur when setting uid=5 in mount options")
+ }
+
+ config.Mounts[0].Data = "uid=0"
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "uid=2"
+ config.UidMappings[0].Size = 10
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when setting uid=2 in mount options and UidMapping[0].size is 10")
+ }
+
+ config.Mounts[0].Data = "uid=20"
+ config.UidMappings[0].Size = 10
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur when setting uid=20 in mount options and UidMapping[0].size is 10")
+ }
+}
+
+// TestValidateRootlessEUIDMountGid exercises rootlessEUIDMount's gid=
+// handling, mirroring the uid= test above.
+func TestValidateRootlessEUIDMountGid(t *testing.T) {
+ config := rootlessEUIDConfig()
+ validator := New()
+
+ config.Mounts = []*configs.Mount{
+  {
+   Source: "devpts",
+   Destination: "/dev/pts",
+   Device: "devpts",
+  },
+ }
+
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "gid=5"
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur when setting gid=5 in mount options")
+ }
+
+ config.Mounts[0].Data = "gid=0"
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err)
+ }
+
+ config.Mounts[0].Data = "gid=5"
+ config.GidMappings[0].Size = 10
+ if err := validator.Validate(config); err != nil {
+  t.Errorf("Expected error to not occur when setting gid=5 in mount options and GidMapping[0].size is 10")
+ }
+
+ config.Mounts[0].Data = "gid=11"
+ config.GidMappings[0].Size = 10
+ if err := validator.Validate(config); err == nil {
+  t.Errorf("Expected error to occur when setting gid=11 in mount options and GidMapping[0].size is 10")
+ }
+}
--- /dev/null
+package validate
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ selinux "github.com/opencontainers/selinux/go-selinux"
+)
+
+// Validator checks whether a container configuration is valid.
+type Validator interface {
+ Validate(*configs.Config) error
+}
+
+// New returns the default Validator implementation.
+func New() Validator {
+ return &ConfigValidator{}
+}
+
+// ConfigValidator is the stateless default implementation of Validator.
+type ConfigValidator struct {
+}
+
+// Validate runs every configuration check and returns the first error
+// encountered. The rootless checks only run when the config is marked
+// RootlessEUID.
+func (v *ConfigValidator) Validate(config *configs.Config) error {
+ if err := v.rootfs(config); err != nil {
+  return err
+ }
+ if err := v.network(config); err != nil {
+  return err
+ }
+ if err := v.hostname(config); err != nil {
+  return err
+ }
+ if err := v.security(config); err != nil {
+  return err
+ }
+ if err := v.usernamespace(config); err != nil {
+  return err
+ }
+ if err := v.cgroupnamespace(config); err != nil {
+  return err
+ }
+ if err := v.sysctl(config); err != nil {
+  return err
+ }
+ if err := v.intelrdt(config); err != nil {
+  return err
+ }
+ if config.RootlessEUID {
+  if err := v.rootlessEUID(config); err != nil {
+   return err
+  }
+ }
+ return nil
+}
+
+// rootfs validates if the rootfs is an absolute path and is not a symlink
+// to the container's root filesystem. It does this by comparing the
+// cleaned configured path against its fully symlink-resolved form;
+// any difference means the path was relative or contained a symlink.
+func (v *ConfigValidator) rootfs(config *configs.Config) error {
+ if _, err := os.Stat(config.Rootfs); err != nil {
+  if os.IsNotExist(err) {
+   return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
+  }
+  return err
+ }
+ cleaned, err := filepath.Abs(config.Rootfs)
+ if err != nil {
+  return err
+ }
+ if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
+  return err
+ }
+ if filepath.Clean(config.Rootfs) != cleaned {
+  return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
+ }
+ return nil
+}
+
+// network rejects network/route configuration when no private network
+// namespace is requested.
+func (v *ConfigValidator) network(config *configs.Config) error {
+ if !config.Namespaces.Contains(configs.NEWNET) {
+  if len(config.Networks) > 0 || len(config.Routes) > 0 {
+   return fmt.Errorf("unable to apply network settings without a private NET namespace")
+  }
+ }
+ return nil
+}
+
+// hostname requires a private UTS namespace whenever a hostname is set.
+func (v *ConfigValidator) hostname(config *configs.Config) error {
+ if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
+  return fmt.Errorf("unable to set hostname without a private UTS namespace")
+ }
+ return nil
+}
+
+// security checks that masked/read-only path restrictions come with a
+// mount namespace, and that a SELinux process label is only used when
+// SELinux is actually enabled on the host.
+func (v *ConfigValidator) security(config *configs.Config) error {
+ // restrict sys without mount namespace
+ if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
+  !config.Namespaces.Contains(configs.NEWNS) {
+  return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
+ }
+ if config.ProcessLabel != "" && !selinux.GetEnabled() {
+  return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
+ }
+
+ return nil
+}
+
+// usernamespace verifies kernel support for user namespaces when one is
+// requested, and rejects uid/gid mappings supplied without one.
+func (v *ConfigValidator) usernamespace(config *configs.Config) error {
+ if config.Namespaces.Contains(configs.NEWUSER) {
+  // /proc/self/ns/user only exists on kernels with userns support.
+  if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+   return fmt.Errorf("USER namespaces aren't enabled in the kernel")
+  }
+ } else {
+  if config.UidMappings != nil || config.GidMappings != nil {
+   return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config")
+  }
+ }
+ return nil
+}
+
+// cgroupnamespace verifies kernel support for cgroup namespaces when
+// one is requested.
+func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
+ if config.Namespaces.Contains(configs.NEWCGROUP) {
+  if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+   return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
+  }
+ }
+ return nil
+}
+
+// sysctl validates that the specified sysctl keys are valid or not.
+// /proc/sys isn't completely namespaced and depending on which namespaces
+// are specified, a subset of sysctls are permitted.
+func (v *ConfigValidator) sysctl(config *configs.Config) error {
+ validSysctlMap := map[string]bool{
+  "kernel.msgmax": true,
+  "kernel.msgmnb": true,
+  "kernel.msgmni": true,
+  "kernel.sem": true,
+  "kernel.shmall": true,
+  "kernel.shmmax": true,
+  "kernel.shmmni": true,
+  "kernel.shm_rmid_forced": true,
+ }
+
+ for s := range config.Sysctl {
+  // IPC-namespaced sysctls (SysV IPC limits and POSIX mqueue).
+  if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
+   if config.Namespaces.Contains(configs.NEWIPC) {
+    continue
+   } else {
+    return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
+   }
+  }
+  // net.* sysctls require a network namespace; when the config joins
+  // an existing netns by path, make sure it isn't the host's.
+  if strings.HasPrefix(s, "net.") {
+   if config.Namespaces.Contains(configs.NEWNET) {
+    if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
+     if err := checkHostNs(s, path); err != nil {
+      return err
+     }
+    }
+    continue
+   } else {
+    return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
+   }
+  }
+  if config.Namespaces.Contains(configs.NEWUTS) {
+   switch s {
+   case "kernel.domainname":
+    // This is namespaced and there's no explicit OCI field for it.
+    continue
+   case "kernel.hostname":
+    // This is namespaced but there's a conflicting (dedicated) OCI field for it.
+    return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
+   }
+  }
+  return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
+ }
+
+ return nil
+}
+
+// intelrdt cross-checks the IntelRdt config against host support:
+// each schema may only be set when the matching RDT feature (CAT/MBA)
+// is enabled, and must be set when it is.
+func (v *ConfigValidator) intelrdt(config *configs.Config) error {
+ if config.IntelRdt != nil {
+  if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
+   return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled")
+  }
+
+  if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" {
+   return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
+  }
+  if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" {
+   return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
+  }
+
+  if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" {
+   return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
+  }
+  if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" {
+   return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
+  }
+ }
+
+ return nil
+}
+
+// isSymbolicLink reports whether path is a symlink (via Lstat, so the
+// link itself is inspected rather than its target).
+func isSymbolicLink(path string) (bool, error) {
+ fi, err := os.Lstat(path)
+ if err != nil {
+  return false, err
+ }
+
+ return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
+}
+
+// checkHostNs checks whether network sysctl is used in host namespace.
+// It compares the readlink target of the configured netns path against
+// that of the current process's netns (/proc/self/ns/net); a match means
+// the config would apply the sysctl to the host's network namespace,
+// which is rejected. Non-symlink paths (e.g. bind-mounted namespace
+// files) are assumed not to be the host namespace.
+func checkHostNs(sysctlConfig string, path string) error {
+ var currentProcessNetns = "/proc/self/ns/net"
+ // readlink on the current processes network namespace
+ destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
+ if err != nil {
+  return fmt.Errorf("read soft link %q error", currentProcessNetns)
+ }
+
+ // First check if the provided path is a symbolic link
+ symLink, err := isSymbolicLink(path)
+ if err != nil {
+  return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
+ }
+
+ if !symLink {
+  // The provided namespace is not a symbolic link,
+  // it is not the host namespace.
+  return nil
+ }
+
+ // readlink on the path provided in the struct
+ destOfContainer, err := os.Readlink(path)
+ if err != nil {
+  return fmt.Errorf("read soft link %q error", path)
+ }
+ if destOfContainer == destOfCurrentProcess {
+  return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
+ }
+ return nil
+}
--- /dev/null
+package validate_test
+
+import (
+ "os"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/configs/validate"
+)
+
+// TestValidate checks that a bare config with only an existing rootfs
+// passes validation.
+func TestValidate(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err != nil {
+  t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+// TestValidateWithInvalidRootfs checks that a symlinked rootfs is
+// rejected. The Symlink error is deliberately ignored: if the link
+// already exists the test still works.
+func TestValidateWithInvalidRootfs(t *testing.T) {
+ dir := "rootfs"
+ os.Symlink("/var", dir)
+ defer os.Remove(dir)
+
+ config := &configs.Config{
+  Rootfs: dir,
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateNetworkWithoutNETNamespace checks that network settings
+// without a NEWNET namespace are rejected.
+func TestValidateNetworkWithoutNETNamespace(t *testing.T) {
+ network := &configs.Network{Type: "loopback"}
+ config := &configs.Config{
+  Rootfs: "/var",
+  Namespaces: []configs.Namespace{},
+  Networks: []*configs.Network{network},
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateNetworkRoutesWithoutNETNamespace checks that route
+// entries without a NEWNET namespace are rejected.
+func TestValidateNetworkRoutesWithoutNETNamespace(t *testing.T) {
+ route := &configs.Route{Gateway: "255.255.255.0"}
+ config := &configs.Config{
+  Rootfs: "/var",
+  Namespaces: []configs.Namespace{},
+  Routes: []*configs.Route{route},
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateHostname checks that a hostname together with a NEWUTS
+// namespace is accepted.
+func TestValidateHostname(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  Hostname: "runc",
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {Type: configs.NEWUTS},
+   },
+  ),
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err != nil {
+  t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+// TestValidateHostnameWithoutUTSNamespace checks that a hostname
+// without a NEWUTS namespace is rejected.
+func TestValidateHostnameWithoutUTSNamespace(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  Hostname: "runc",
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateSecurityWithMaskPaths checks that MaskPaths plus a NEWNS
+// namespace is accepted.
+func TestValidateSecurityWithMaskPaths(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  MaskPaths: []string{"/proc/kcore"},
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {Type: configs.NEWNS},
+   },
+  ),
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err != nil {
+  t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+// TestValidateSecurityWithROPaths checks that ReadonlyPaths plus a
+// NEWNS namespace is accepted.
+func TestValidateSecurityWithROPaths(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  ReadonlyPaths: []string{"/proc/sys"},
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {Type: configs.NEWNS},
+   },
+  ),
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err != nil {
+  t.Errorf("Expected error to not occur: %+v", err)
+ }
+}
+
+// TestValidateSecurityWithoutNEWNS checks that mask/read-only paths
+// without a NEWNS namespace are rejected.
+func TestValidateSecurityWithoutNEWNS(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  MaskPaths: []string{"/proc/kcore"},
+  ReadonlyPaths: []string{"/proc/sys"},
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateUsernamespace checks that requesting a NEWUSER namespace
+// validates on hosts with userns support (skipped otherwise).
+func TestValidateUsernamespace(t *testing.T) {
+ if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+  t.Skip("userns is unsupported")
+ }
+ config := &configs.Config{
+  Rootfs: "/var",
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {Type: configs.NEWUSER},
+   },
+  ),
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err != nil {
+  t.Errorf("expected error to not occur %+v", err)
+ }
+}
+
+// TestValidateUsernamespaceWithoutUserNS checks that uid mappings
+// without a NEWUSER namespace are rejected.
+func TestValidateUsernamespaceWithoutUserNS(t *testing.T) {
+ uidMap := configs.IDMap{ContainerID: 123}
+ config := &configs.Config{
+  Rootfs: "/var",
+  UidMappings: []configs.IDMap{uidMap},
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateSysctl checks that namespaced sysctls are rejected when
+// the corresponding namespaces are absent.
+func TestValidateSysctl(t *testing.T) {
+ sysctl := map[string]string{
+  "fs.mqueue.ctl": "ctl",
+  "net.ctl": "ctl",
+  "kernel.ctl": "ctl",
+ }
+
+ for k, v := range sysctl {
+  config := &configs.Config{
+   Rootfs: "/var",
+   Sysctl: map[string]string{k: v},
+  }
+
+  validator := validate.New()
+  err := validator.Validate(config)
+  if err == nil {
+   t.Error("Expected error to occur but it was nil")
+  }
+ }
+}
+
+// TestValidateValidSysctl checks that the same sysctls are accepted
+// when NEWNET and NEWIPC namespaces are configured.
+func TestValidateValidSysctl(t *testing.T) {
+ sysctl := map[string]string{
+  "fs.mqueue.ctl": "ctl",
+  "net.ctl": "ctl",
+  "kernel.msgmax": "ctl",
+ }
+
+ for k, v := range sysctl {
+  config := &configs.Config{
+   Rootfs: "/var",
+   Sysctl: map[string]string{k: v},
+   Namespaces: []configs.Namespace{
+    {
+     Type: configs.NEWNET,
+    },
+    {
+     Type: configs.NEWIPC,
+    },
+   },
+  }
+
+  validator := validate.New()
+  err := validator.Validate(config)
+  if err != nil {
+   t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err)
+  }
+ }
+}
+
+// TestValidateSysctlWithSameNs checks that a net.* sysctl is rejected
+// when the config joins the host's own network namespace by path.
+func TestValidateSysctlWithSameNs(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  Sysctl: map[string]string{"net.ctl": "ctl"},
+  Namespaces: configs.Namespaces(
+   []configs.Namespace{
+    {
+     Type: configs.NEWNET,
+     Path: "/proc/self/ns/net",
+    },
+   },
+  ),
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
+
+// TestValidateSysctlWithoutNETNamespace checks that a net.* sysctl is
+// rejected when no network namespace is configured at all.
+func TestValidateSysctlWithoutNETNamespace(t *testing.T) {
+ config := &configs.Config{
+  Rootfs: "/var",
+  Sysctl: map[string]string{"net.ctl": "ctl"},
+  Namespaces: []configs.Namespace{},
+ }
+
+ validator := validate.New()
+ err := validator.Validate(config)
+ if err == nil {
+  t.Error("Expected error to occur but it was nil")
+ }
+}
--- /dev/null
+package libcontainer
+
+import (
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
+// mount initializes the console inside the rootfs mounting with the specified mount label
+// and applying the correct ownership of the console.
+// The umask is cleared (and restored on return) so /dev/console is
+// created with exactly the intended mode; an already-existing file is
+// fine since it only serves as the bind-mount target.
+func mountConsole(slavePath string) error {
+ oldMask := unix.Umask(0000)
+ defer unix.Umask(oldMask)
+ f, err := os.Create("/dev/console")
+ if err != nil && !os.IsExist(err) {
+  return err
+ }
+ if f != nil {
+  f.Close()
+ }
+ return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
+}
+
+// dupStdio opens the slavePath for the console and dups the fds to the current
+// processes stdio, fd 0,1,2.
+// NOTE(review): the freshly opened fd is not closed after being dup'd
+// onto 0-2 — presumably intentional here, but worth confirming.
+func dupStdio(slavePath string) error {
+ fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
+ if err != nil {
+  // Wrap the raw errno in an os.PathError for a friendlier message.
+  return &os.PathError{
+   Op: "open",
+   Path: slavePath,
+   Err: err,
+  }
+ }
+ for _, i := range []int{0, 1, 2} {
+  if err := unix.Dup3(fd, i, 0); err != nil {
+   return err
+  }
+ }
+ return nil
+}
--- /dev/null
+// Package libcontainer provides a native Go implementation for creating containers
+// with namespaces, cgroups, capabilities, and filesystem access controls.
+// It allows you to manage the lifecycle of the container performing additional operations
+// after the container is created.
+package libcontainer
+
+import (
+ "os"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Status is the status of a container. See String for the
+// human-readable form of each value.
+type Status int
+
+const (
+ // Created is the status that denotes the container exists but has not been run yet.
+ Created Status = iota
+ // Running is the status that denotes the container exists and is running.
+ Running
+ // Pausing is the status that denotes the container exists, it is in the process of being paused.
+ Pausing
+ // Paused is the status that denotes the container exists, but all its processes are paused.
+ Paused
+ // Stopped is the status that denotes the container does not have a created or running process.
+ Stopped
+)
+
+// String implements fmt.Stringer, returning the lowercase name of the
+// status ("unknown" for out-of-range values).
+func (s Status) String() string {
+ switch s {
+ case Created:
+  return "created"
+ case Running:
+  return "running"
+ case Pausing:
+  return "pausing"
+ case Paused:
+  return "paused"
+ case Stopped:
+  return "stopped"
+ default:
+  return "unknown"
+ }
+}
+
+// BaseState represents the platform agnostic pieces relating to a
+// running container's state. It is embedded by the platform-specific
+// State type and serialized as JSON.
+type BaseState struct {
+ // ID is the container ID.
+ ID string `json:"id"`
+
+ // InitProcessPid is the init process id in the parent namespace.
+ InitProcessPid int `json:"init_process_pid"`
+
+ // InitProcessStartTime is the init process start time in clock cycles since boot time.
+ InitProcessStartTime uint64 `json:"init_process_start"`
+
+ // Created is the unix timestamp for the creation time of the container in UTC
+ Created time.Time `json:"created"`
+
+ // Config is the container's configuration.
+ Config configs.Config `json:"config"`
+}
+
+// BaseContainer is a libcontainer container object.
+//
+// Each container is thread-safe within the same process. Since a container can
+// be destroyed by a separate process, any function may return that the container
+// was not found. BaseContainer includes methods that are platform agnostic.
+type BaseContainer interface {
+ // Returns the ID of the container
+ ID() string
+
+ // Returns the current status of the container.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // Systemerror - System error.
+ Status() (Status, error)
+
+ // State returns the current container's state information.
+ //
+ // errors:
+ // SystemError - System error.
+ State() (*State, error)
+
+ // OCIState returns the current container's state information.
+ //
+ // errors:
+ // SystemError - System error.
+ OCIState() (*specs.State, error)
+
+ // Returns the current config of the container.
+ Config() configs.Config
+
+ // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // Systemerror - System error.
+ //
+ // Some of the returned PIDs may no longer refer to processes in the Container, unless
+ // the Container state is PAUSED in which case every PID in the slice is valid.
+ Processes() ([]int, error)
+
+ // Returns statistics for the container.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // Systemerror - System error.
+ Stats() (*Stats, error)
+
+ // Set resources of container as configured
+ //
+ // We can use this to change resources when containers are running.
+ //
+ // errors:
+ // SystemError - System error.
+ Set(config configs.Config) error
+
+ // Start a process inside the container. Returns error if process fails to
+ // start. You can track process lifecycle with passed Process structure.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // ConfigInvalid - config is invalid,
+ // ContainerPaused - Container is paused,
+ // SystemError - System error.
+ Start(process *Process) (err error)
+
+ // Run immediately starts the process inside the container. Returns error if process
+ // fails to start. It does not block waiting for the exec fifo after start returns but
+ // opens the fifo after start returns.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // ConfigInvalid - config is invalid,
+ // ContainerPaused - Container is paused,
+ // SystemError - System error.
+ Run(process *Process) (err error)
+
+ // Destroys the container, if its in a valid state, after killing any
+ // remaining running processes.
+ //
+ // Any event registrations are removed before the container is destroyed.
+ // No error is returned if the container is already destroyed.
+ //
+ // Running containers must first be stopped using Signal(..).
+ // Paused containers must first be resumed using Resume(..).
+ //
+ // errors:
+ // ContainerNotStopped - Container is still running,
+ // ContainerPaused - Container is paused,
+ // SystemError - System error.
+ Destroy() error
+
+ // Signal sends the provided signal code to the container's initial process.
+ //
+ // If all is specified the signal is sent to all processes in the container
+ // including the initial process.
+ //
+ // errors:
+ // SystemError - System error.
+ Signal(s os.Signal, all bool) error
+
+ // Exec signals the container to exec the users process at the end of the init.
+ //
+ // errors:
+ // SystemError - System error.
+ Exec() error
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "bytes"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "reflect"
+ "strings"
+ "sync"
+ "syscall" // only for SysProcAttr and Signal
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/criurpc"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/golang/protobuf/proto"
+ "github.com/sirupsen/logrus"
+ "github.com/vishvananda/netlink/nl"
+ "golang.org/x/sys/unix"
+)
+
+// stdioFdCount is the number of standard file descriptors (stdin,
+// stdout, stderr) preceding any extra fds passed to a child process.
+const stdioFdCount = 3
+
+// linuxContainer is the Linux implementation of the Container
+// interface. All mutating operations take m, its state mutex.
+type linuxContainer struct {
+ id string
+ root string
+ config *configs.Config
+ cgroupManager cgroups.Manager
+ intelRdtManager intelrdt.Manager
+ initPath string
+ initArgs []string
+ initProcess parentProcess
+ initProcessStartTime uint64
+ criuPath string
+ newuidmapPath string
+ newgidmapPath string
+ m sync.Mutex
+ criuVersion int
+ state containerState
+ created time.Time
+}
+
+// State represents a running container's state. It embeds BaseState and is
+// serialized as JSON (see the json tags on the fields below).
+type State struct {
+ BaseState
+
+ // Platform specific fields below here
+
+ // Specified if the container was started under the rootless mode.
+ // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
+ Rootless bool `json:"rootless"`
+
+ // Path to all the cgroups setup for a container. Key is cgroup subsystem name
+ // with the value as the path.
+ CgroupPaths map[string]string `json:"cgroup_paths"`
+
+ // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
+ // with the value as the path.
+ NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
+
+ // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
+ ExternalDescriptors []string `json:"external_descriptors,omitempty"`
+
+ // Intel RDT "resource control" filesystem path
+ IntelRdtPath string `json:"intel_rdt_path"`
+}
+
+// Container is a libcontainer container object.
+//
+// Each container is thread-safe within the same process. Since a container can
+// be destroyed by a separate process, any function may return that the container
+// was not found.
+type Container interface {
+ BaseContainer
+
+ // Methods below here are platform specific
+
+ // Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
+ //
+ // errors:
+ // SystemError - System error.
+ Checkpoint(criuOpts *CriuOpts) error
+
+ // Restore restores the checkpointed container to a running state using the criu(8) utility.
+ //
+ // errors:
+ // SystemError - System error.
+ Restore(process *Process, criuOpts *CriuOpts) error
+
+ // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
+ // the execution of any user processes. Asynchronously, when the container has finished being
+ // paused the state is changed to PAUSED.
+ // If the Container state is PAUSED, do nothing.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // ContainerNotRunning - Container not running or created,
+ // SystemError - System error.
+ Pause() error
+
+ // If the Container state is PAUSED, resumes the execution of any user processes in the
+ // Container before setting the Container state to RUNNING.
+ // If the Container state is RUNNING, do nothing.
+ //
+ // errors:
+ // ContainerNotExists - Container no longer exists,
+ // ContainerNotPaused - Container is not paused,
+ // SystemError - System error.
+ Resume() error
+
+ // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
+ //
+ // errors:
+ // SystemError - System error.
+ NotifyOOM() (<-chan struct{}, error)
+
+ // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
+ //
+ // errors:
+ // SystemError - System error.
+ NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
+}
+
+// ID returns the container's unique ID.
+func (c *linuxContainer) ID() string {
+ return c.id
+}
+
+// Config returns the container's configuration. Note that it dereferences the
+// internal pointer, so callers receive a shallow copy of the config struct.
+func (c *linuxContainer) Config() configs.Config {
+ return *c.config
+}
+
+// Status returns the container's current status, holding the state lock for
+// the duration of the check.
+func (c *linuxContainer) Status() (Status, error) {
+ c.m.Lock()
+ defer c.m.Unlock()
+ return c.currentStatus()
+}
+
+// State returns the container's current state, holding the state lock for the
+// duration of the call.
+func (c *linuxContainer) State() (*State, error) {
+ c.m.Lock()
+ defer c.m.Unlock()
+ return c.currentState()
+}
+
+// OCIState returns the container's state in the OCI runtime-spec form
+// (specs.State), holding the state lock for the duration of the call.
+func (c *linuxContainer) OCIState() (*specs.State, error) {
+ c.m.Lock()
+ defer c.m.Unlock()
+ return c.currentOCIState()
+}
+
+// Processes returns the PIDs of all processes inside the container, as
+// reported by the cgroup manager.
+func (c *linuxContainer) Processes() ([]int, error) {
+ pids, err := c.cgroupManager.GetAllPids()
+ if err != nil {
+ return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
+ }
+ return pids, nil
+}
+
+// Stats collects the container's resource statistics: cgroup stats, Intel RDT
+// stats (when an RDT manager is configured), and per-interface network stats
+// for veth interfaces. On error the partially filled Stats is returned
+// alongside the error.
+func (c *linuxContainer) Stats() (*Stats, error) {
+ var (
+ err error
+ stats = &Stats{}
+ )
+ if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
+ return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
+ }
+ if c.intelRdtManager != nil {
+ if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
+ return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
+ }
+ }
+ // Only veth interfaces have host-side counters to report; other types are skipped.
+ for _, iface := range c.config.Networks {
+ switch iface.Type {
+ case "veth":
+ istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
+ if err != nil {
+ return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
+ }
+ stats.Interfaces = append(stats.Interfaces, istats)
+ }
+ }
+ return stats, nil
+}
+
+// Set applies a new resource configuration to a running (non-stopped)
+// container. If applying the cgroup or Intel RDT settings fails, it attempts
+// to roll back to the previous configuration before returning the error; a
+// failed rollback is only logged, which may leave state.json and the actual
+// configs inconsistent.
+func (c *linuxContainer) Set(config configs.Config) error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ status, err := c.currentStatus()
+ if err != nil {
+ return err
+ }
+ if status == Stopped {
+ return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+ }
+ if err := c.cgroupManager.Set(&config); err != nil {
+ // Set configs back
+ if err2 := c.cgroupManager.Set(c.config); err2 != nil {
+ logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+ }
+ return err
+ }
+ if c.intelRdtManager != nil {
+ if err := c.intelRdtManager.Set(&config); err != nil {
+ // Set configs back
+ if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
+ logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+ }
+ return err
+ }
+ }
+ // After config setting succeed, update config and states
+ c.config = &config
+ _, err = c.updateState(nil)
+ return err
+}
+
+// Start starts the given process inside the container. For an init process it
+// first creates the exec fifo used to block the init until Exec is called; the
+// fifo is removed again if starting fails.
+func (c *linuxContainer) Start(process *Process) error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ if process.Init {
+ if err := c.createExecFifo(); err != nil {
+ return err
+ }
+ }
+ if err := c.start(process); err != nil {
+ if process.Init {
+ c.deleteExecFifo()
+ }
+ return err
+ }
+ return nil
+}
+
+// Run is Start followed, for an init process, by exec: it unblocks the init
+// process so the user program actually runs. Note it does not take c.m itself;
+// Start and Exec/exec handle their own locking.
+func (c *linuxContainer) Run(process *Process) error {
+ if err := c.Start(process); err != nil {
+ return err
+ }
+ if process.Init {
+ return c.exec()
+ }
+ return nil
+}
+
+// Exec signals the container's init to exec the user process (see exec below),
+// holding the state lock for the duration.
+func (c *linuxContainer) Exec() error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ return c.exec()
+}
+
+// exec unblocks the container's init process by opening and draining the exec
+// fifo. It races the fifo open against the init process dying, so a dead init
+// produces an error instead of blocking forever on the fifo. On success the
+// fifo file is removed.
+func (c *linuxContainer) exec() error {
+ path := filepath.Join(c.root, execFifoFilename)
+
+ fifoOpen := make(chan struct{})
+ select {
+ case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
+ return errors.New("container process is already dead")
+ case result := <-awaitFifoOpen(path):
+ // Closing fifoOpen stops the process-exit poller goroutine.
+ close(fifoOpen)
+ if result.err != nil {
+ return result.err
+ }
+ f := result.file
+ defer f.Close()
+ if err := readFromExecFifo(f); err != nil {
+ return err
+ }
+ return os.Remove(path)
+ }
+}
+
+// readFromExecFifo drains the exec fifo. Receiving zero bytes (len can never
+// be negative, so `<= 0` is effectively `== 0`) means the writer side wrote
+// nothing, which indicates the container was already started.
+func readFromExecFifo(execFifo io.Reader) error {
+ data, err := ioutil.ReadAll(execFifo)
+ if err != nil {
+ return err
+ }
+ if len(data) <= 0 {
+ return fmt.Errorf("cannot start an already running container")
+ }
+ return nil
+}
+
+// awaitProcessExit returns a channel that is closed once the process with the
+// given pid is dead or a zombie, polled every 100ms. Closing the exit channel
+// stops the poller goroutine without closing the returned channel.
+func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
+ isDead := make(chan struct{})
+ go func() {
+ for {
+ select {
+ case <-exit:
+ return
+ case <-time.After(time.Millisecond * 100):
+ // A Stat error or zombie state both count as "dead".
+ stat, err := system.Stat(pid)
+ if err != nil || stat.State == system.Zombie {
+ close(isDead)
+ return
+ }
+ }
+ }
+ }()
+ return isDead
+}
+
+// awaitFifoOpen opens the fifo at path for reading in a goroutine (the open
+// blocks until a writer appears) and delivers the result on the returned
+// channel.
+func awaitFifoOpen(path string) <-chan openResult {
+ fifoOpened := make(chan openResult)
+ go func() {
+ f, err := os.OpenFile(path, os.O_RDONLY, 0)
+ if err != nil {
+ fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
+ return
+ }
+ fifoOpened <- openResult{file: f}
+ }()
+ return fifoOpened
+}
+
+// openResult carries the outcome of an asynchronous file open: exactly one of
+// file or err is set.
+type openResult struct {
+ file *os.File
+ err error
+}
+
+// start creates and starts the parent process for the given Process. For an
+// init process it additionally transitions the container to the created state,
+// persists the state (recording the init process start time), and runs any
+// configured poststart hooks; a failing hook terminates the just-started
+// process. Callers must hold c.m.
+func (c *linuxContainer) start(process *Process) error {
+ parent, err := c.newParentProcess(process)
+ if err != nil {
+ return newSystemErrorWithCause(err, "creating new parent process")
+ }
+ if err := parent.start(); err != nil {
+ // terminate the process to ensure that it properly is reaped.
+ if err := ignoreTerminateErrors(parent.terminate()); err != nil {
+ logrus.Warn(err)
+ }
+ return newSystemErrorWithCause(err, "starting container process")
+ }
+ // generate a timestamp indicating when the container was started
+ c.created = time.Now().UTC()
+ if process.Init {
+ c.state = &createdState{
+ c: c,
+ }
+ state, err := c.updateState(parent)
+ if err != nil {
+ return err
+ }
+ c.initProcessStartTime = state.InitProcessStartTime
+
+ if c.config.Hooks != nil {
+ s, err := c.currentOCIState()
+ if err != nil {
+ return err
+ }
+ for i, hook := range c.config.Hooks.Poststart {
+ if err := hook.Run(s); err != nil {
+ if err := ignoreTerminateErrors(parent.terminate()); err != nil {
+ logrus.Warn(err)
+ }
+ return newSystemErrorWithCausef(err, "running poststart hook %d", i)
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// Signal sends s to the container's init process, or — when all is true — to
+// every process in the container's cgroup (including init).
+func (c *linuxContainer) Signal(s os.Signal, all bool) error {
+ if all {
+ return signalAllProcesses(c.cgroupManager, s)
+ }
+ if err := c.initProcess.signal(s); err != nil {
+ return newSystemErrorWithCause(err, "signaling init process")
+ }
+ return nil
+}
+
+// createExecFifo creates the exec fifo in the container's state directory with
+// mode 0622 (umask temporarily cleared so the mode is applied exactly) and
+// chowns it to the host root uid/gid of the container. It fails if the fifo
+// already exists.
+func (c *linuxContainer) createExecFifo() error {
+ rootuid, err := c.Config().HostRootUID()
+ if err != nil {
+ return err
+ }
+ rootgid, err := c.Config().HostRootGID()
+ if err != nil {
+ return err
+ }
+
+ fifoName := filepath.Join(c.root, execFifoFilename)
+ if _, err := os.Stat(fifoName); err == nil {
+ return fmt.Errorf("exec fifo %s already exists", fifoName)
+ }
+ oldMask := unix.Umask(0000)
+ if err := unix.Mkfifo(fifoName, 0622); err != nil {
+ // Restore the umask before bailing out.
+ unix.Umask(oldMask)
+ return err
+ }
+ unix.Umask(oldMask)
+ return os.Chown(fifoName, rootuid, rootgid)
+}
+
+// deleteExecFifo removes the exec fifo from the container's state directory,
+// ignoring any error (best effort cleanup).
+func (c *linuxContainer) deleteExecFifo() {
+ fifoName := filepath.Join(c.root, execFifoFilename)
+ os.Remove(fifoName)
+}
+
+// includeExecFifo opens the container's execfifo as a pathfd, so that the
+// container cannot access the statedir (and the FIFO itself remains
+// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
+// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
+ // O_PATH yields a fd usable only for path-level operations, not read/write.
+ fifoName := filepath.Join(c.root, execFifoFilename)
+ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return err
+ }
+
+ // ExtraFiles are mapped starting at fd 3 (after stdio), hence stdioFdCount.
+ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
+ cmd.Env = append(cmd.Env,
+ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+ return nil
+}
+
+// newParentProcess builds the parent-side process object for p: a setns
+// process for `runc exec` (p.Init == false), or an init process (with the
+// exec fifo wired in) otherwise. The socket pair is the init pipe used to
+// communicate with the child.
+func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
+ parentPipe, childPipe, err := utils.NewSockPair("init")
+ if err != nil {
+ return nil, newSystemErrorWithCause(err, "creating new init pipe")
+ }
+ cmd, err := c.commandTemplate(p, childPipe)
+ if err != nil {
+ return nil, newSystemErrorWithCause(err, "creating new command template")
+ }
+ if !p.Init {
+ return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
+ }
+
+ // We only set up fifoFd if we're not doing a `runc exec`. The historic
+ // reason for this is that previously we would pass a dirfd that allowed
+ // for container rootfs escape (and not doing it in `runc exec` avoided
+ // that problem), but we no longer do that. However, there's no need to do
+ // this for `runc exec` so we just keep it this way to be safe.
+ if err := c.includeExecFifo(cmd); err != nil {
+ return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
+ }
+ return c.newInitProcess(p, cmd, parentPipe, childPipe)
+}
+
+// commandTemplate builds the exec.Cmd used to launch the container init
+// binary: stdio wired to the Process, cwd set to the rootfs, the child end of
+// the init pipe (and optionally the console socket) passed via ExtraFiles
+// with their fd numbers advertised through _LIBCONTAINER_* env vars.
+func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
+ cmd := exec.Command(c.initPath, c.initArgs[1:]...)
+ // Preserve the configured argv[0], which may differ from the binary path.
+ cmd.Args[0] = c.initArgs[0]
+ cmd.Stdin = p.Stdin
+ cmd.Stdout = p.Stdout
+ cmd.Stderr = p.Stderr
+ cmd.Dir = c.config.Rootfs
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+ cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
+ cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
+ if p.ConsoleSocket != nil {
+ cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
+ cmd.Env = append(cmd.Env,
+ fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+ )
+ }
+ cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
+ cmd.Env = append(cmd.Env,
+ fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+ )
+ // NOTE: when running a container with no PID namespace and the parent process spawning the container is
+ // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
+ // even with the parent still running.
+ if c.config.ParentDeathSignal > 0 {
+ cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
+ }
+ return cmd, nil
+}
+
+// newInitProcess wraps cmd into an initProcess: it marks the init type via
+// env, collects the namespace paths to join, prepares the netlink bootstrap
+// data for nsexec, and records the result as c.initProcess.
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
+ cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+ nsMaps := make(map[configs.NamespaceType]string)
+ for _, ns := range c.config.Namespaces {
+ if ns.Path != "" {
+ nsMaps[ns.Type] = ns.Path
+ }
+ }
+ // A configured PID namespace path means init shares a pid ns with others.
+ _, sharePidns := nsMaps[configs.NEWPID]
+ data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
+ if err != nil {
+ return nil, err
+ }
+ init := &initProcess{
+ cmd: cmd,
+ childPipe: childPipe,
+ parentPipe: parentPipe,
+ manager: c.cgroupManager,
+ intelRdtManager: c.intelRdtManager,
+ config: c.newInitConfig(p),
+ container: c,
+ process: p,
+ bootstrapData: data,
+ sharePidns: sharePidns,
+ }
+ c.initProcess = init
+ return init, nil
+}
+
+// newSetnsProcess wraps cmd into a setnsProcess used for `runc exec`: the new
+// process joins the existing container's namespaces (via setns in nsexec, so
+// no clone flags are passed) and its cgroup/Intel RDT paths.
+func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
+ cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
+ state, err := c.currentState()
+ if err != nil {
+ return nil, newSystemErrorWithCause(err, "getting container's current state")
+ }
+ // for setns process, we don't have to set cloneflags as the process namespaces
+ // will only be set via setns syscall
+ data, err := c.bootstrapData(0, state.NamespacePaths)
+ if err != nil {
+ return nil, err
+ }
+ return &setnsProcess{
+ cmd: cmd,
+ cgroupPaths: c.cgroupManager.GetPaths(),
+ rootlessCgroups: c.config.RootlessCgroups,
+ intelRdtPath: state.IntelRdtPath,
+ childPipe: childPipe,
+ parentPipe: parentPipe,
+ config: c.newInitConfig(p),
+ process: p,
+ bootstrapData: data,
+ }, nil
+}
+
+// newInitConfig merges the container-level config with per-process overrides
+// into the initConfig sent to the child. Process-level NoNewPrivileges,
+// AppArmor profile, SELinux label and rlimits, when set, take precedence over
+// the container defaults.
+func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
+ cfg := &initConfig{
+ Config: c.config,
+ Args: process.Args,
+ Env: process.Env,
+ User: process.User,
+ AdditionalGroups: process.AdditionalGroups,
+ Cwd: process.Cwd,
+ Capabilities: process.Capabilities,
+ PassedFilesCount: len(process.ExtraFiles),
+ ContainerId: c.ID(),
+ NoNewPrivileges: c.config.NoNewPrivileges,
+ RootlessEUID: c.config.RootlessEUID,
+ RootlessCgroups: c.config.RootlessCgroups,
+ AppArmorProfile: c.config.AppArmorProfile,
+ ProcessLabel: c.config.ProcessLabel,
+ Rlimits: c.config.Rlimits,
+ }
+ if process.NoNewPrivileges != nil {
+ cfg.NoNewPrivileges = *process.NoNewPrivileges
+ }
+ if process.AppArmorProfile != "" {
+ cfg.AppArmorProfile = process.AppArmorProfile
+ }
+ if process.Label != "" {
+ cfg.ProcessLabel = process.Label
+ }
+ if len(process.Rlimits) > 0 {
+ cfg.Rlimits = process.Rlimits
+ }
+ // A console is created only when the caller passed a console socket.
+ cfg.CreateConsole = process.ConsoleSocket != nil
+ cfg.ConsoleWidth = process.ConsoleWidth
+ cfg.ConsoleHeight = process.ConsoleHeight
+ return cfg
+}
+
+// Destroy delegates to the current state's destroy method under the state
+// lock; the state object decides whether destruction is legal.
+func (c *linuxContainer) Destroy() error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ return c.state.destroy()
+}
+
+// Pause freezes all processes of a Running or Created container via the
+// cgroup freezer, then transitions the container to the paused state. Any
+// other status yields a ContainerNotRunning error.
+func (c *linuxContainer) Pause() error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ status, err := c.currentStatus()
+ if err != nil {
+ return err
+ }
+ switch status {
+ case Running, Created:
+ if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
+ return err
+ }
+ return c.state.transition(&pausedState{
+ c: c,
+ })
+ }
+ return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
+}
+
+// Resume thaws a Paused container via the cgroup freezer and transitions it
+// back to the running state. A non-paused container yields a
+// ContainerNotPaused error.
+func (c *linuxContainer) Resume() error {
+ c.m.Lock()
+ defer c.m.Unlock()
+ status, err := c.currentStatus()
+ if err != nil {
+ return err
+ }
+ if status != Paused {
+ return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
+ }
+ if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
+ return err
+ }
+ return c.state.transition(&runningState{
+ c: c,
+ })
+}
+
+// NotifyOOM returns a channel that signals OOM notifications, backed by the
+// container's cgroup paths. Under rootless cgroups this may fail, hence the
+// warning.
+func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
+ // XXX(cyphar): This requires cgroups.
+ if c.config.RootlessCgroups {
+ logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
+ }
+ return notifyOnOOM(c.cgroupManager.GetPaths())
+}
+
+// NotifyMemoryPressure returns a channel that signals when the container
+// reaches the given memory pressure level, backed by the container's cgroup
+// paths. Under rootless cgroups this may fail, hence the warning.
+func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
+ // XXX(cyphar): This requires cgroups.
+ if c.config.RootlessCgroups {
+ logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
+ }
+ return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
+}
+
+// criuFeatures holds the reply of the most recent FEATURE_CHECK RPC; it is
+// package-level because the RPC response handler (in criuSwrk) fills it in.
+var criuFeatures *criurpc.CriuFeatures
+
+// checkCriuFeatures asks CRIU (via the FEATURE_CHECK RPC, available since
+// CRIU 1.8) whether the features requested in criuFeat are supported, and
+// returns an error if any requested feature is missing. With CRIU older than
+// 1.8 the check is silently skipped.
+func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
+
+ var t criurpc.CriuReqType
+ t = criurpc.CriuReqType_FEATURE_CHECK
+
+ // criu 1.8 => 10800
+ if err := c.checkCriuVersion(10800); err != nil {
+ // Feature checking was introduced with CRIU 1.8.
+ // Ignore the feature check if an older CRIU version is used
+ // and just act as before.
+ // As all automated PR testing is done using CRIU 1.7 this
+ // code will not be tested by automated PR testing.
+ return nil
+ }
+
+ // make sure the features we are looking for are really not from
+ // some previous check
+ criuFeatures = nil
+
+ req := &criurpc.CriuReq{
+ Type: &t,
+ // Theoretically this should not be necessary but CRIU
+ // segfaults if Opts is empty.
+ // Fixed in CRIU 2.12
+ Opts: rpcOpts,
+ Features: criuFeat,
+ }
+
+ err := c.criuSwrk(nil, req, criuOpts, false, nil)
+ if err != nil {
+ logrus.Debugf("%s", err)
+ return fmt.Errorf("CRIU feature check failed")
+ }
+
+ logrus.Debugf("Feature check says: %s", criuFeatures)
+ missingFeatures := false
+
+ // The outer if checks if the fields actually exist
+ if (criuFeat.MemTrack != nil) &&
+ (criuFeatures.MemTrack != nil) {
+ // The inner if checks if they are set to true
+ if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
+ missingFeatures = true
+ logrus.Debugf("CRIU does not support MemTrack")
+ }
+ }
+
+ // This needs to be repeated for every new feature check.
+ // Is there a way to put this in a function. Reflection?
+ if (criuFeat.LazyPages != nil) &&
+ (criuFeatures.LazyPages != nil) {
+ if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
+ missingFeatures = true
+ logrus.Debugf("CRIU does not support LazyPages")
+ }
+ }
+
+ if missingFeatures {
+ return fmt.Errorf("CRIU is missing features")
+ }
+
+ return nil
+}
+
+// parseCriuVersion runs `criu -V` at path and parses the reported version,
+// handling both release output ("Version: x.y[.z]") and git-build output
+// ("GitID: vx.y[.z]-..."). The result is encoded as x*10000 + y*100 + z;
+// for git builds the minor (or sublevel) is bumped by one so a git build
+// sorts after the release it was cut from.
+func parseCriuVersion(path string) (int, error) {
+ var x, y, z int
+
+ out, err := exec.Command(path, "-V").Output()
+ if err != nil {
+ return 0, fmt.Errorf("Unable to execute CRIU command: %s", path)
+ }
+
+ x = 0
+ y = 0
+ z = 0
+ if ep := strings.Index(string(out), "-"); ep >= 0 {
+ // criu Git version format
+ var version string
+ if sp := strings.Index(string(out), "GitID"); sp > 0 {
+ version = string(out)[sp:ep]
+ } else {
+ return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path)
+ }
+
+ n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
+ if err != nil {
+ n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6
+ y++
+ } else {
+ z++
+ }
+ if n < 2 || err != nil {
+ return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
+ }
+ } else {
+ // criu release version format
+ n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
+ if err != nil {
+ n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
+ }
+ if n < 2 || err != nil {
+ return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
+ }
+ }
+
+ return x*10000 + y*100 + z, nil
+}
+
+// compareCriuVersion returns an error when criuVersion (encoded as
+// x*10000 + y*100 + z) is lower than minVersion, and nil otherwise.
+func compareCriuVersion(criuVersion int, minVersion int) error {
+ // simple function to perform the actual version compare
+ if criuVersion < minVersion {
+ return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
+ }
+
+ return nil
+}
+
+// criuVersionRPC stores the result of the CRIU version RPC; it is
+// package-level because the RPC response handler (in criuSwrk) fills it in.
+var criuVersionRPC *criurpc.CriuVersion
+
+// checkCriuVersion checks Criu version greater than or equal to minVersion.
+// The detected version is cached in c.criuVersion; detection first tries the
+// VERSION RPC (CRIU >= 3.0) and falls back to parsing `criu -V` output.
+func (c *linuxContainer) checkCriuVersion(minVersion int) error {
+
+ // If the version of criu has already been determined there is no need
+ // to ask criu for the version again. Use the value from c.criuVersion.
+ if c.criuVersion != 0 {
+ return compareCriuVersion(c.criuVersion, minVersion)
+ }
+
+ // First try if this version of CRIU support the version RPC.
+ // The CRIU version RPC was introduced with CRIU 3.0.
+
+ // First, reset the variable for the RPC answer to nil
+ criuVersionRPC = nil
+
+ var t criurpc.CriuReqType
+ t = criurpc.CriuReqType_VERSION
+ req := &criurpc.CriuReq{
+ Type: &t,
+ }
+
+ err := c.criuSwrk(nil, req, nil, false, nil)
+ if err != nil {
+ return fmt.Errorf("CRIU version check failed: %s", err)
+ }
+
+ if criuVersionRPC != nil {
+ logrus.Debugf("CRIU version: %s", criuVersionRPC)
+ // major and minor are always set
+ c.criuVersion = int(*criuVersionRPC.Major) * 10000
+ c.criuVersion += int(*criuVersionRPC.Minor) * 100
+ if criuVersionRPC.Sublevel != nil {
+ c.criuVersion += int(*criuVersionRPC.Sublevel)
+ }
+ if criuVersionRPC.Gitid != nil {
+ // runc's convention is that a CRIU git release is
+ // always the same as increasing the minor by 1
+ c.criuVersion -= (c.criuVersion % 100)
+ c.criuVersion += 100
+ }
+ return compareCriuVersion(c.criuVersion, minVersion)
+ }
+
+ // This is CRIU without the version RPC and therefore
+ // older than 3.0. Parsing the output is required.
+
+ // This can be remove once runc does not work with criu older than 3.0
+
+ c.criuVersion, err = parseCriuVersion(c.criuPath)
+ if err != nil {
+ return err
+ }
+
+ return compareCriuVersion(c.criuVersion, minVersion)
+}
+
+// descriptorsFilename is the file inside the image directory that records the
+// container's external stdio descriptors for checkpoint/restore.
+const descriptorsFilename = "descriptors.json"
+
+// addCriuDumpMount registers m as an external mount for a CRIU dump. The
+// destination is made rootfs-relative before being used as both key and value
+// of the external mount map.
+func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
+ mountDest := m.Destination
+ if strings.HasPrefix(mountDest, c.config.Rootfs) {
+ mountDest = mountDest[len(c.config.Rootfs):]
+ }
+
+ extMnt := &criurpc.ExtMountMap{
+ Key: proto.String(mountDest),
+ Val: proto.String(mountDest),
+ }
+ req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+}
+
+// addMaskPaths registers each masked (non-directory) path of the container as
+// an external mount mapped to /dev/null, so CRIU treats the /dev/null
+// bind-mounts used for masking as external. Paths that do not exist inside
+// the container, or that are directories, are skipped.
+func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
+ for _, path := range c.config.MaskPaths {
+ // Resolve the path through the init process's root to see the
+ // container's view of the filesystem.
+ fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
+ if err != nil {
+ if os.IsNotExist(err) {
+ continue
+ }
+ return err
+ }
+ if fi.IsDir() {
+ continue
+ }
+
+ extMnt := &criurpc.ExtMountMap{
+ Key: proto.String(path),
+ Val: proto.String("/dev/null"),
+ }
+ req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+ }
+ return nil
+}
+
+// waitForCriuLazyServer blocks until CRIU's lazy-pages server writes its
+// ready byte to the status pipe r, then forwards that byte into the file at
+// the given status path so external tooling can observe readiness.
+//
+// NOTE(review): fd is not closed on the Write error path; consider
+// `defer fd.Close()` after the OpenFile error check.
+func waitForCriuLazyServer(r *os.File, status string) error {
+
+ data := make([]byte, 1)
+ _, err := r.Read(data)
+ if err != nil {
+ return err
+ }
+ fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
+ if err != nil {
+ return err
+ }
+ _, err = fd.Write(data)
+ if err != nil {
+ return err
+ }
+ fd.Close()
+
+ return nil
+}
+
+// Checkpoint dumps the running container's state to criuOpts.ImagesDirectory
+// using CRIU (DUMP, or PRE_DUMP when criuOpts.PreDump is set). It builds the
+// CRIU RPC options from criuOpts and the container config, registers external
+// mounts/devices/mask paths (full dumps only), records the init process's
+// stdio descriptors in the image directory, and finally drives CRIU through
+// criuSwrk. Requires CRIU >= 1.5.2; some options need newer versions and are
+// version-checked individually below.
+func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
+ c.m.Lock()
+ defer c.m.Unlock()
+
+ // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
+ // (CLI prints a warning)
+ // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
+ // support for doing unprivileged dumps, but the setup of
+ // rootless containers might make this complicated.
+
+ // criu 1.5.2 => 10502
+ if err := c.checkCriuVersion(10502); err != nil {
+ return err
+ }
+
+ if criuOpts.ImagesDirectory == "" {
+ return fmt.Errorf("invalid directory to save checkpoint")
+ }
+
+ // Since a container can be C/R'ed multiple times,
+ // the checkpoint directory may already exist.
+ if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
+ return err
+ }
+
+ if criuOpts.WorkDirectory == "" {
+ criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
+ }
+
+ if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
+ return err
+ }
+
+ // CRIU takes the directories as open fds over the RPC socket.
+ workDir, err := os.Open(criuOpts.WorkDirectory)
+ if err != nil {
+ return err
+ }
+ defer workDir.Close()
+
+ imageDir, err := os.Open(criuOpts.ImagesDirectory)
+ if err != nil {
+ return err
+ }
+ defer imageDir.Close()
+
+ rpcOpts := criurpc.CriuOpts{
+ ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
+ WorkDirFd: proto.Int32(int32(workDir.Fd())),
+ LogLevel: proto.Int32(4),
+ LogFile: proto.String("dump.log"),
+ Root: proto.String(c.config.Rootfs),
+ ManageCgroups: proto.Bool(true),
+ NotifyScripts: proto.Bool(true),
+ Pid: proto.Int32(int32(c.initProcess.pid())),
+ ShellJob: proto.Bool(criuOpts.ShellJob),
+ LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
+ TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
+ ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
+ FileLocks: proto.Bool(criuOpts.FileLocks),
+ EmptyNs: proto.Uint32(criuOpts.EmptyNs),
+ OrphanPtsMaster: proto.Bool(true),
+ AutoDedup: proto.Bool(criuOpts.AutoDedup),
+ LazyPages: proto.Bool(criuOpts.LazyPages),
+ }
+
+ // If the container is running in a network namespace and has
+ // a path to the network namespace configured, we will dump
+ // that network namespace as an external namespace and we
+ // will expect that the namespace exists during restore.
+ // This basically means that CRIU will ignore the namespace
+ // and expect to be setup correctly.
+ nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
+ if nsPath != "" {
+ // For this to work we need at least criu 3.11.0 => 31100.
+ // As there was already a successful version check we will
+ // not error out if it fails. runc will just behave as it used
+ // to do and ignore external network namespaces.
+ err := c.checkCriuVersion(31100)
+ if err == nil {
+ // CRIU expects the information about an external namespace
+ // like this: --external net[<inode>]:<key>
+ // This <key> is always 'extRootNetNS'.
+ var netns syscall.Stat_t
+ err = syscall.Stat(nsPath, &netns)
+ if err != nil {
+ return err
+ }
+ criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
+ rpcOpts.External = append(rpcOpts.External, criuExternal)
+ }
+ }
+
+ // Let CRIU freeze the container via its freezer cgroup, if one exists.
+ fcg := c.cgroupManager.GetPaths()["freezer"]
+ if fcg != "" {
+ rpcOpts.FreezeCgroup = proto.String(fcg)
+ }
+
+ // append optional criu opts, e.g., page-server and port
+ if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
+ rpcOpts.Ps = &criurpc.CriuPageServerInfo{
+ Address: proto.String(criuOpts.PageServer.Address),
+ Port: proto.Int32(criuOpts.PageServer.Port),
+ }
+ }
+
+ //pre-dump may need parentImage param to complete iterative migration
+ if criuOpts.ParentImage != "" {
+ rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
+ rpcOpts.TrackMem = proto.Bool(true)
+ }
+
+ // append optional manage cgroups mode
+ if criuOpts.ManageCgroupsMode != 0 {
+ // criu 1.7 => 10700
+ if err := c.checkCriuVersion(10700); err != nil {
+ return err
+ }
+ mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+ rpcOpts.ManageCgroupsMode = &mode
+ }
+
+ var t criurpc.CriuReqType
+ if criuOpts.PreDump {
+ // Pre-dump requires CRIU's memory-tracking support.
+ feat := criurpc.CriuFeatures{
+ MemTrack: proto.Bool(true),
+ }
+
+ if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
+ return err
+ }
+
+ t = criurpc.CriuReqType_PRE_DUMP
+ } else {
+ t = criurpc.CriuReqType_DUMP
+ }
+ req := &criurpc.CriuReq{
+ Type: &t,
+ Opts: &rpcOpts,
+ }
+
+ if criuOpts.LazyPages {
+ // lazy migration requested; check if criu supports it
+ feat := criurpc.CriuFeatures{
+ LazyPages: proto.Bool(true),
+ }
+
+ if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
+ return err
+ }
+
+ // Pipe through which CRIU reports that the lazy-pages server is ready.
+ statusRead, statusWrite, err := os.Pipe()
+ if err != nil {
+ return err
+ }
+ rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
+ go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
+ }
+
+ //no need to dump these information in pre-dump
+ if !criuOpts.PreDump {
+ for _, m := range c.config.Mounts {
+ switch m.Device {
+ case "bind":
+ c.addCriuDumpMount(req, m)
+ case "cgroup":
+ // cgroup mounts expand to one bind per subsystem.
+ binds, err := getCgroupMounts(m)
+ if err != nil {
+ return err
+ }
+ for _, b := range binds {
+ c.addCriuDumpMount(req, b)
+ }
+ }
+ }
+
+ if err := c.addMaskPaths(req); err != nil {
+ return err
+ }
+
+ // Device nodes are bind mounts from the host and thus external too.
+ for _, node := range c.config.Devices {
+ m := &configs.Mount{Destination: node.Path, Source: node.Path}
+ c.addCriuDumpMount(req, m)
+ }
+
+ // Write the FD info to a file in the image directory
+ fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
+ if err != nil {
+ return err
+ }
+
+ err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
+ if err != nil {
+ return err
+ }
+ }
+
+ err = c.criuSwrk(nil, req, criuOpts, false, nil)
+ if err != nil {
+ return err
+ }
+ return nil
+}
+
+// addCriuRestoreMount registers m as an external mount for a CRIU restore:
+// the rootfs-relative destination is the key (matching the dump side) and the
+// host source path is the value.
+func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
+ mountDest := m.Destination
+ if strings.HasPrefix(mountDest, c.config.Rootfs) {
+ mountDest = mountDest[len(c.config.Rootfs):]
+ }
+
+ extMnt := &criurpc.ExtMountMap{
+ Key: proto.String(mountDest),
+ Val: proto.String(m.Source),
+ }
+ req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+}
+
+// restoreNetwork populates the CRIU restore request with veth pairs from both
+// the container config and any extra pairs passed in criuOpts, so CRIU can
+// recreate them on restore. Loopback interfaces need no action.
+func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
+ for _, iface := range c.config.Networks {
+ switch iface.Type {
+ case "veth":
+ veth := new(criurpc.CriuVethPair)
+ veth.IfOut = proto.String(iface.HostInterfaceName)
+ veth.IfIn = proto.String(iface.Name)
+ req.Opts.Veths = append(req.Opts.Veths, veth)
+ case "loopback":
+ // Do nothing
+ }
+ }
+ for _, i := range criuOpts.VethPairs {
+ veth := new(criurpc.CriuVethPair)
+ veth.IfOut = proto.String(i.HostInterfaceName)
+ veth.IfIn = proto.String(i.ContainerInterfaceName)
+ req.Opts.Veths = append(req.Opts.Veths, veth)
+ }
+}
+
+func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
+ c.m.Lock()
+ defer c.m.Unlock()
+
+ var extraFiles []*os.File
+
+ // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
+ // (CLI prints a warning)
+ // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
+ // support for unprivileged restore at the moment.
+
+ // criu 1.5.2 => 10502
+ if err := c.checkCriuVersion(10502); err != nil {
+ return err
+ }
+ if criuOpts.WorkDirectory == "" {
+ criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
+ }
+ // Since a container can be C/R'ed multiple times,
+ // the work directory may already exist.
+ if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
+ return err
+ }
+ workDir, err := os.Open(criuOpts.WorkDirectory)
+ if err != nil {
+ return err
+ }
+ defer workDir.Close()
+ if criuOpts.ImagesDirectory == "" {
+ return fmt.Errorf("invalid directory to restore checkpoint")
+ }
+ imageDir, err := os.Open(criuOpts.ImagesDirectory)
+ if err != nil {
+ return err
+ }
+ defer imageDir.Close()
+ // CRIU has a few requirements for a root directory:
+ // * it must be a mount point
+ // * its parent must not be overmounted
+ // c.config.Rootfs is bind-mounted to a temporary directory
+ // to satisfy these requirements.
+ root := filepath.Join(c.root, "criu-root")
+ if err := os.Mkdir(root, 0755); err != nil {
+ return err
+ }
+ defer os.Remove(root)
+ root, err = filepath.EvalSymlinks(root)
+ if err != nil {
+ return err
+ }
+ err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
+ if err != nil {
+ return err
+ }
+ defer unix.Unmount(root, unix.MNT_DETACH)
+ t := criurpc.CriuReqType_RESTORE
+ req := &criurpc.CriuReq{
+ Type: &t,
+ Opts: &criurpc.CriuOpts{
+ ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
+ WorkDirFd: proto.Int32(int32(workDir.Fd())),
+ EvasiveDevices: proto.Bool(true),
+ LogLevel: proto.Int32(4),
+ LogFile: proto.String("restore.log"),
+ RstSibling: proto.Bool(true),
+ Root: proto.String(root),
+ ManageCgroups: proto.Bool(true),
+ NotifyScripts: proto.Bool(true),
+ ShellJob: proto.Bool(criuOpts.ShellJob),
+ ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
+ TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
+ FileLocks: proto.Bool(criuOpts.FileLocks),
+ EmptyNs: proto.Uint32(criuOpts.EmptyNs),
+ OrphanPtsMaster: proto.Bool(true),
+ AutoDedup: proto.Bool(criuOpts.AutoDedup),
+ LazyPages: proto.Bool(criuOpts.LazyPages),
+ },
+ }
+
+ // Same as during checkpointing. If the container has a specific network namespace
+ // assigned to it, this now expects that the checkpoint will be restored in a
+ // already created network namespace.
+ nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
+ if nsPath != "" {
+ // For this to work we need at least criu 3.11.0 => 31100.
+ // As there was already a successful version check we will
+ // not error out if it fails. runc will just behave as it used
+ // to do and ignore external network namespaces.
+ err := c.checkCriuVersion(31100)
+ if err == nil {
+ // CRIU wants the information about an existing network namespace
+ // like this: --inherit-fd fd[<fd>]:<key>
+ // The <key> needs to be the same as during checkpointing.
+ // We are always using 'extRootNetNS' as the key in this.
+ netns, err := os.Open(nsPath)
+ defer netns.Close()
+ if err != nil {
+ logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
+ return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
+ }
+ inheritFd := new(criurpc.InheritFd)
+ inheritFd.Key = proto.String("extRootNetNS")
+ // The offset of four is necessary because 0, 1, 2 and 3 is already
+ // used by stdin, stdout, stderr, 'criu swrk' socket.
+ inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
+ req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
+ // All open FDs need to be transferred to CRIU via extraFiles
+ extraFiles = append(extraFiles, netns)
+ }
+ }
+
+ for _, m := range c.config.Mounts {
+ switch m.Device {
+ case "bind":
+ c.addCriuRestoreMount(req, m)
+ case "cgroup":
+ binds, err := getCgroupMounts(m)
+ if err != nil {
+ return err
+ }
+ for _, b := range binds {
+ c.addCriuRestoreMount(req, b)
+ }
+ }
+ }
+
+ if len(c.config.MaskPaths) > 0 {
+ m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
+ c.addCriuRestoreMount(req, m)
+ }
+
+ for _, node := range c.config.Devices {
+ m := &configs.Mount{Destination: node.Path, Source: node.Path}
+ c.addCriuRestoreMount(req, m)
+ }
+
+ if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
+ c.restoreNetwork(req, criuOpts)
+ }
+
+ // append optional manage cgroups mode
+ if criuOpts.ManageCgroupsMode != 0 {
+ // criu 1.7 => 10700
+ if err := c.checkCriuVersion(10700); err != nil {
+ return err
+ }
+ mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+ req.Opts.ManageCgroupsMode = &mode
+ }
+
+ var (
+ fds []string
+ fdJSON []byte
+ )
+ if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
+ return err
+ }
+
+ if err := json.Unmarshal(fdJSON, &fds); err != nil {
+ return err
+ }
+ for i := range fds {
+ if s := fds[i]; strings.Contains(s, "pipe:") {
+ inheritFd := new(criurpc.InheritFd)
+ inheritFd.Key = proto.String(s)
+ inheritFd.Fd = proto.Int32(int32(i))
+ req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
+ }
+ }
+ return c.criuSwrk(process, req, criuOpts, true, extraFiles)
+}
+
+func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
+ // XXX: Do we need to deal with this case? AFAIK criu still requires root.
+ if err := c.cgroupManager.Apply(pid); err != nil {
+ return err
+ }
+
+ if err := c.cgroupManager.Set(c.config); err != nil {
+ return newSystemError(err)
+ }
+
+ path := fmt.Sprintf("/proc/%d/cgroup", pid)
+ cgroupsPaths, err := cgroups.ParseCgroupFile(path)
+ if err != nil {
+ return err
+ }
+
+ for c, p := range cgroupsPaths {
+ cgroupRoot := &criurpc.CgroupRoot{
+ Ctrl: proto.String(c),
+ Path: proto.String(p),
+ }
+ req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
+ }
+
+ return nil
+}
+
+func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
+ if err != nil {
+ return err
+ }
+
+ var logPath string
+ if opts != nil {
+ logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
+ } else {
+ // For the VERSION RPC 'opts' is set to 'nil' and therefore
+ // opts.WorkDirectory does not exist. Set logPath to "".
+ logPath = ""
+ }
+ criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
+ criuClientFileCon, err := net.FileConn(criuClient)
+ criuClient.Close()
+ if err != nil {
+ return err
+ }
+
+ criuClientCon := criuClientFileCon.(*net.UnixConn)
+ defer criuClientCon.Close()
+
+ criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
+ defer criuServer.Close()
+
+ args := []string{"swrk", "3"}
+ if c.criuVersion != 0 {
+ // If the CRIU Version is still '0' then this is probably
+ // the initial CRIU run to detect the version. Skip it.
+ logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
+ }
+ logrus.Debugf("Using CRIU with following args: %s", args)
+ cmd := exec.Command(c.criuPath, args...)
+ if process != nil {
+ cmd.Stdin = process.Stdin
+ cmd.Stdout = process.Stdout
+ cmd.Stderr = process.Stderr
+ }
+ cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
+ if extraFiles != nil {
+ cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
+ }
+
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ criuServer.Close()
+
+ defer func() {
+ criuClientCon.Close()
+ _, err := cmd.Process.Wait()
+ if err != nil {
+ return
+ }
+ }()
+
+ if applyCgroups {
+ err := c.criuApplyCgroups(cmd.Process.Pid, req)
+ if err != nil {
+ return err
+ }
+ }
+
+ var extFds []string
+ if process != nil {
+ extFds, err = getPipeFds(cmd.Process.Pid)
+ if err != nil {
+ return err
+ }
+ }
+
+ logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
+ // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
+ // should be empty. For older CRIU versions it still will be
+ // available but empty. criurpc.CriuReqType_VERSION actually
+ // has no req.GetOpts().
+ if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
+ req.GetType() == criurpc.CriuReqType_VERSION) {
+
+ val := reflect.ValueOf(req.GetOpts())
+ v := reflect.Indirect(val)
+ for i := 0; i < v.NumField(); i++ {
+ st := v.Type()
+ name := st.Field(i).Name
+ if strings.HasPrefix(name, "XXX_") {
+ continue
+ }
+ value := val.MethodByName("Get" + name).Call([]reflect.Value{})
+ logrus.Debugf("CRIU option %s with value %v", name, value[0])
+ }
+ }
+ data, err := proto.Marshal(req)
+ if err != nil {
+ return err
+ }
+ _, err = criuClientCon.Write(data)
+ if err != nil {
+ return err
+ }
+
+ buf := make([]byte, 10*4096)
+ oob := make([]byte, 4096)
+ for true {
+ n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
+ if err != nil {
+ return err
+ }
+ if n == 0 {
+ return fmt.Errorf("unexpected EOF")
+ }
+ if n == len(buf) {
+ return fmt.Errorf("buffer is too small")
+ }
+
+ resp := new(criurpc.CriuResp)
+ err = proto.Unmarshal(buf[:n], resp)
+ if err != nil {
+ return err
+ }
+ if !resp.GetSuccess() {
+ typeString := req.GetType().String()
+ if typeString == "VERSION" {
+ // If the VERSION RPC fails this probably means that the CRIU
+ // version is too old for this RPC. Just return 'nil'.
+ return nil
+ }
+ return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
+ }
+
+ t := resp.GetType()
+ switch {
+ case t == criurpc.CriuReqType_VERSION:
+ logrus.Debugf("CRIU version: %s", resp)
+ criuVersionRPC = resp.GetVersion()
+ break
+ case t == criurpc.CriuReqType_FEATURE_CHECK:
+ logrus.Debugf("Feature check says: %s", resp)
+ criuFeatures = resp.GetFeatures()
+ case t == criurpc.CriuReqType_NOTIFY:
+ if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil {
+ return err
+ }
+ t = criurpc.CriuReqType_NOTIFY
+ req = &criurpc.CriuReq{
+ Type: &t,
+ NotifySuccess: proto.Bool(true),
+ }
+ data, err = proto.Marshal(req)
+ if err != nil {
+ return err
+ }
+ _, err = criuClientCon.Write(data)
+ if err != nil {
+ return err
+ }
+ continue
+ case t == criurpc.CriuReqType_RESTORE:
+ case t == criurpc.CriuReqType_DUMP:
+ case t == criurpc.CriuReqType_PRE_DUMP:
+ default:
+ return fmt.Errorf("unable to parse the response %s", resp.String())
+ }
+
+ break
+ }
+
+ criuClientCon.CloseWrite()
+ // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
+ // Here we want to wait only the CRIU process.
+ st, err := cmd.Process.Wait()
+ if err != nil {
+ return err
+ }
+
+ // In pre-dump mode CRIU is in a loop and waits for
+ // the final DUMP command.
+ // The current runc pre-dump approach, however, is
+ // start criu in PRE_DUMP once for a single pre-dump
+ // and not the whole series of pre-dump, pre-dump, ...m, dump
+ // If we got the message CriuReqType_PRE_DUMP it means
+ // CRIU was successful and we need to forcefully stop CRIU
+ if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
+ return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
+ }
+ return nil
+}
+
+// block any external network activity
+func lockNetwork(config *configs.Config) error {
+ for _, config := range config.Networks {
+ strategy, err := getStrategy(config.Type)
+ if err != nil {
+ return err
+ }
+
+ if err := strategy.detach(config); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func unlockNetwork(config *configs.Config) error {
+ for _, config := range config.Networks {
+ strategy, err := getStrategy(config.Type)
+ if err != nil {
+ return err
+ }
+ if err = strategy.attach(config); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error {
+ notify := resp.GetNotify()
+ if notify == nil {
+ return fmt.Errorf("invalid response: %s", resp.String())
+ }
+ logrus.Debugf("notify: %s\n", notify.GetScript())
+ switch {
+ case notify.GetScript() == "post-dump":
+ f, err := os.Create(filepath.Join(c.root, "checkpoint"))
+ if err != nil {
+ return err
+ }
+ f.Close()
+ case notify.GetScript() == "network-unlock":
+ if err := unlockNetwork(c.config); err != nil {
+ return err
+ }
+ case notify.GetScript() == "network-lock":
+ if err := lockNetwork(c.config); err != nil {
+ return err
+ }
+ case notify.GetScript() == "setup-namespaces":
+ if c.config.Hooks != nil {
+ s, err := c.currentOCIState()
+ if err != nil {
+ return nil
+ }
+ for i, hook := range c.config.Hooks.Prestart {
+ if err := hook.Run(s); err != nil {
+ return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+ }
+ }
+ }
+ case notify.GetScript() == "post-restore":
+ pid := notify.GetPid()
+ r, err := newRestoredProcess(int(pid), fds)
+ if err != nil {
+ return err
+ }
+ process.ops = r
+ if err := c.state.transition(&restoredState{
+ imageDir: opts.ImagesDirectory,
+ c: c,
+ }); err != nil {
+ return err
+ }
+ // create a timestamp indicating when the restored checkpoint was started
+ c.created = time.Now().UTC()
+ if _, err := c.updateState(r); err != nil {
+ return err
+ }
+ if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
+ if !os.IsNotExist(err) {
+ logrus.Error(err)
+ }
+ }
+ case notify.GetScript() == "orphan-pts-master":
+ scm, err := unix.ParseSocketControlMessage(oob)
+ if err != nil {
+ return err
+ }
+ fds, err := unix.ParseUnixRights(&scm[0])
+ if err != nil {
+ return err
+ }
+
+ master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
+ defer master.Close()
+
+ // While we can access console.master, using the API is a good idea.
+ if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
+ if process != nil {
+ c.initProcess = process
+ }
+ state, err := c.currentState()
+ if err != nil {
+ return nil, err
+ }
+ err = c.saveState(state)
+ if err != nil {
+ return nil, err
+ }
+ return state, nil
+}
+
+func (c *linuxContainer) saveState(s *State) error {
+ f, err := os.Create(filepath.Join(c.root, stateFilename))
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ return utils.WriteJSON(f, s)
+}
+
+func (c *linuxContainer) deleteState() error {
+ return os.Remove(filepath.Join(c.root, stateFilename))
+}
+
+func (c *linuxContainer) currentStatus() (Status, error) {
+ if err := c.refreshState(); err != nil {
+ return -1, err
+ }
+ return c.state.status(), nil
+}
+
+// refreshState needs to be called to verify that the current state on the
+// container is what is true. Because consumers of libcontainer can use it
+// out of process we need to verify the container's status based on runtime
+// information and not rely on our in process info.
+func (c *linuxContainer) refreshState() error {
+ paused, err := c.isPaused()
+ if err != nil {
+ return err
+ }
+ if paused {
+ return c.state.transition(&pausedState{c: c})
+ }
+ t, err := c.runType()
+ if err != nil {
+ return err
+ }
+ switch t {
+ case Created:
+ return c.state.transition(&createdState{c: c})
+ case Running:
+ return c.state.transition(&runningState{c: c})
+ }
+ return c.state.transition(&stoppedState{c: c})
+}
+
+func (c *linuxContainer) runType() (Status, error) {
+ if c.initProcess == nil {
+ return Stopped, nil
+ }
+ pid := c.initProcess.pid()
+ stat, err := system.Stat(pid)
+ if err != nil {
+ return Stopped, nil
+ }
+ if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
+ return Stopped, nil
+ }
+ // We'll create exec fifo and blocking on it after container is created,
+ // and delete it after start container.
+ if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
+ return Created, nil
+ }
+ return Running, nil
+}
+
+func (c *linuxContainer) isPaused() (bool, error) {
+ fcg := c.cgroupManager.GetPaths()["freezer"]
+ if fcg == "" {
+ // A container doesn't have a freezer cgroup
+ return false, nil
+ }
+ data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
+ if err != nil {
+ // If freezer cgroup is not mounted, the container would just be not paused.
+ if os.IsNotExist(err) {
+ return false, nil
+ }
+ return false, newSystemErrorWithCause(err, "checking if container is paused")
+ }
+ return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
+}
+
+func (c *linuxContainer) currentState() (*State, error) {
+ var (
+ startTime uint64
+ externalDescriptors []string
+ pid = -1
+ )
+ if c.initProcess != nil {
+ pid = c.initProcess.pid()
+ startTime, _ = c.initProcess.startTime()
+ externalDescriptors = c.initProcess.externalDescriptors()
+ }
+ intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
+ if err != nil {
+ intelRdtPath = ""
+ }
+ state := &State{
+ BaseState: BaseState{
+ ID: c.ID(),
+ Config: *c.config,
+ InitProcessPid: pid,
+ InitProcessStartTime: startTime,
+ Created: c.created,
+ },
+ Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
+ CgroupPaths: c.cgroupManager.GetPaths(),
+ IntelRdtPath: intelRdtPath,
+ NamespacePaths: make(map[configs.NamespaceType]string),
+ ExternalDescriptors: externalDescriptors,
+ }
+ if pid > 0 {
+ for _, ns := range c.config.Namespaces {
+ state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+ }
+ for _, nsType := range configs.NamespaceTypes() {
+ if !configs.IsNamespaceSupported(nsType) {
+ continue
+ }
+ if _, ok := state.NamespacePaths[nsType]; !ok {
+ ns := configs.Namespace{Type: nsType}
+ state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+ }
+ }
+ }
+ return state, nil
+}
+
+func (c *linuxContainer) currentOCIState() (*specs.State, error) {
+ bundle, annotations := utils.Annotations(c.config.Labels)
+ state := &specs.State{
+ Version: specs.Version,
+ ID: c.ID(),
+ Bundle: bundle,
+ Annotations: annotations,
+ }
+ status, err := c.currentStatus()
+ if err != nil {
+ return nil, err
+ }
+ state.Status = status.String()
+ if status != Stopped {
+ if c.initProcess != nil {
+ state.Pid = c.initProcess.pid()
+ }
+ }
+ return state, nil
+}
+
+// orderNamespacePaths sorts namespace paths into a list of paths that we
+// can setns in order.
+func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
+ paths := []string{}
+ for _, ns := range configs.NamespaceTypes() {
+
+ // Remove namespaces that we don't need to join.
+ if !c.config.Namespaces.Contains(ns) {
+ continue
+ }
+
+ if p, ok := namespaces[ns]; ok && p != "" {
+ // check if the requested namespace is supported
+ if !configs.IsNamespaceSupported(ns) {
+ return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
+ }
+ // only set to join this namespace if it exists
+ if _, err := os.Lstat(p); err != nil {
+ return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
+ }
+ // do not allow namespace path with comma as we use it to separate
+ // the namespace paths
+ if strings.ContainsRune(p, ',') {
+ return nil, newSystemError(fmt.Errorf("invalid path %s", p))
+ }
+ paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
+ }
+
+ }
+
+ return paths, nil
+}
+
+func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
+ data := bytes.NewBuffer(nil)
+ for _, im := range idMap {
+ line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
+ if _, err := data.WriteString(line); err != nil {
+ return nil, err
+ }
+ }
+ return data.Bytes(), nil
+}
+
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
//
// NOTE(review): the attribute order below is presumably what the nsenter
// bootstrap code expects when parsing the message — confirm before
// reordering any AddData calls.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
	// create the netlink message
	r := nl.NewNetlinkRequest(int(InitMsg), 0)

	// write cloneFlags
	r.AddData(&Int32msg{
		Type: CloneFlagsAttr,
		Value: uint32(cloneFlags),
	})

	// write custom namespace paths
	if len(nsMaps) > 0 {
		nsPaths, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		// Paths are joined with ',' — orderNamespacePaths rejects paths
		// containing a comma for exactly this reason.
		r.AddData(&Bytemsg{
			Type: NsPathsAttr,
			Value: []byte(strings.Join(nsPaths, ",")),
		})
	}

	// write uid/gid mappings only when we are not joining an existing user ns
	// (the namespace paths above carry the user ns in that case)
	_, joinExistingUser := nsMaps[configs.NEWUSER]
	if !joinExistingUser {
		// write uid mappings
		if len(c.config.UidMappings) > 0 {
			// Rootless with a setuid newuidmap helper: tell the bootstrap
			// program which binary to exec for writing the map.
			if c.config.RootlessEUID && c.newuidmapPath != "" {
				r.AddData(&Bytemsg{
					Type: UidmapPathAttr,
					Value: []byte(c.newuidmapPath),
				})
			}
			b, err := encodeIDMapping(c.config.UidMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type: UidmapAttr,
				Value: b,
			})
		}

		// write gid mappings
		if len(c.config.GidMappings) > 0 {
			b, err := encodeIDMapping(c.config.GidMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type: GidmapAttr,
				Value: b,
			})
			if c.config.RootlessEUID && c.newgidmapPath != "" {
				r.AddData(&Bytemsg{
					Type: GidmapPathAttr,
					Value: []byte(c.newgidmapPath),
				})
			}
			// setgroups must be denied unless we have real root or a
			// mapping helper; see requiresRootOrMappingTool.
			if requiresRootOrMappingTool(c.config) {
				r.AddData(&Boolmsg{
					Type: SetgroupAttr,
					Value: true,
				})
			}
		}
	}

	if c.config.OomScoreAdj != nil {
		// write oom_score_adj
		r.AddData(&Bytemsg{
			Type: OomScoreAdjAttr,
			Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
		})
	}

	// write rootless
	r.AddData(&Boolmsg{
		Type: RootlessEUIDAttr,
		Value: c.config.RootlessEUID,
	})

	return bytes.NewReader(r.Serialize()), nil
}
+
// ignoreTerminateErrors filters out errors that merely indicate the process
// was already gone (or already reaped) by the time terminate ran; nil and
// such benign errors map to nil, anything else is returned unaltered.
func ignoreTerminateErrors(err error) error {
	if err == nil {
		return nil
	}
	msg := err.Error()
	benign := strings.Contains(msg, "process already finished") ||
		strings.Contains(msg, "Wait was already called")
	if benign {
		return nil
	}
	return err
}
+
+func requiresRootOrMappingTool(c *configs.Config) bool {
+ gidMap := []configs.IDMap{
+ {ContainerID: 0, HostID: os.Getegid(), Size: 1},
+ }
+ return !reflect.DeepEqual(c.GidMappings, gidMap)
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/system"
+)
+
// mockCgroupManager is a test double for the cgroup manager: it returns the
// canned pids/stats/paths it was constructed with and treats every mutating
// call as a successful no-op.
type mockCgroupManager struct {
	pids []int
	allPids []int
	stats *cgroups.Stats
	paths map[string]string
}

// mockIntelRdtManager is the Intel RDT counterpart: canned stats and path,
// no-op mutations.
type mockIntelRdtManager struct {
	stats *intelrdt.Stats
	path string
}

// GetPids returns the canned pids for the container's own cgroup.
func (m *mockCgroupManager) GetPids() ([]int, error) {
	return m.pids, nil
}

// GetAllPids returns the canned pids for the cgroup and its sub-cgroups.
func (m *mockCgroupManager) GetAllPids() ([]int, error) {
	return m.allPids, nil
}

// GetStats returns the canned cgroup stats.
func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
	return m.stats, nil
}

// Apply is a no-op; the mock never touches real cgroups.
func (m *mockCgroupManager) Apply(pid int) error {
	return nil
}

// Set is a no-op.
func (m *mockCgroupManager) Set(container *configs.Config) error {
	return nil
}

// Destroy is a no-op.
func (m *mockCgroupManager) Destroy() error {
	return nil
}

// GetPaths returns the canned controller-to-path map.
func (m *mockCgroupManager) GetPaths() map[string]string {
	return m.paths
}

// Freeze is a no-op.
func (m *mockCgroupManager) Freeze(state configs.FreezerState) error {
	return nil
}

// Apply is a no-op.
func (m *mockIntelRdtManager) Apply(pid int) error {
	return nil
}

// GetStats returns the canned Intel RDT stats.
func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) {
	return m.stats, nil
}

// Destroy is a no-op.
func (m *mockIntelRdtManager) Destroy() error {
	return nil
}

// GetPath returns the canned resctrl path.
func (m *mockIntelRdtManager) GetPath() string {
	return m.path
}

// Set is a no-op.
func (m *mockIntelRdtManager) Set(container *configs.Config) error {
	return nil
}
+
// mockProcess is a minimal stand-in for a container init process: it reports
// a fixed pid and start time and accepts every lifecycle call as a no-op.
type mockProcess struct {
	_pid int
	started uint64
}

// terminate is a no-op; there is no real process behind the mock.
func (m *mockProcess) terminate() error {
	return nil
}

// pid returns the fixed pid the mock was constructed with.
func (m *mockProcess) pid() int {
	return m._pid
}

// startTime returns the fixed start time, never failing.
func (m *mockProcess) startTime() (uint64, error) {
	return m.started, nil
}

// start is a no-op.
func (m *mockProcess) start() error {
	return nil
}

// wait returns no process state; callers in these tests never inspect it.
func (m *mockProcess) wait() (*os.ProcessState, error) {
	return nil, nil
}

// signal is a no-op.
func (m *mockProcess) signal(_ os.Signal) error {
	return nil
}

// externalDescriptors always reports an empty descriptor list.
func (m *mockProcess) externalDescriptors() []string {
	return []string{}
}

// setExternalDescriptors intentionally discards its argument.
func (m *mockProcess) setExternalDescriptors(newFds []string) {
}
+
+func TestGetContainerPids(t *testing.T) {
+ container := &linuxContainer{
+ id: "myid",
+ config: &configs.Config{},
+ cgroupManager: &mockCgroupManager{allPids: []int{1, 2, 3}},
+ }
+ pids, err := container.Processes()
+ if err != nil {
+ t.Fatal(err)
+ }
+ for i, expected := range []int{1, 2, 3} {
+ if pids[i] != expected {
+ t.Fatalf("expected pid %d but received %d", expected, pids[i])
+ }
+ }
+}
+
+func TestGetContainerStats(t *testing.T) {
+ container := &linuxContainer{
+ id: "myid",
+ config: &configs.Config{},
+ cgroupManager: &mockCgroupManager{
+ pids: []int{1, 2, 3},
+ stats: &cgroups.Stats{
+ MemoryStats: cgroups.MemoryStats{
+ Usage: cgroups.MemoryData{
+ Usage: 1024,
+ },
+ },
+ },
+ },
+ intelRdtManager: &mockIntelRdtManager{
+ stats: &intelrdt.Stats{
+ L3CacheSchema: "L3:0=f;1=f0",
+ MemBwSchema: "MB:0=20;1=70",
+ },
+ },
+ }
+ stats, err := container.Stats()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if stats.CgroupStats == nil {
+ t.Fatal("cgroup stats are nil")
+ }
+ if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
+ t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
+ }
+ if intelrdt.IsCatEnabled() {
+ if stats.IntelRdtStats == nil {
+ t.Fatal("intel rdt stats are nil")
+ }
+ if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" {
+ t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but recevied %s", stats.IntelRdtStats.L3CacheSchema)
+ }
+ }
+ if intelrdt.IsMbaEnabled() {
+ if stats.IntelRdtStats == nil {
+ t.Fatal("intel rdt stats are nil")
+ }
+ if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" {
+ t.Fatalf("expected MemBwSchema MB:0=20;1=70 but recevied %s", stats.IntelRdtStats.MemBwSchema)
+ }
+ }
+}
+
// TestGetContainerState verifies that State() reports the init pid and start
// time from the init process, the cgroup and Intel RDT paths from the
// managers, and a namespace path for every configured namespace — the
// explicitly configured NEWNET path verbatim, and /proc/<pid>/ns defaults
// for the rest.
func TestGetContainerState(t *testing.T) {
	var (
		pid = os.Getpid()
		expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
		expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
		expectedIntelRdtPath = "/sys/fs/resctrl/myid"
	)
	container := &linuxContainer{
		id: "myid",
		config: &configs.Config{
			Namespaces: []configs.Namespace{
				{Type: configs.NEWPID},
				{Type: configs.NEWNS},
				{Type: configs.NEWNET, Path: expectedNetworkPath},
				{Type: configs.NEWUTS},
				// emulate host for IPC
				//{Type: configs.NEWIPC},
			},
		},
		initProcess: &mockProcess{
			_pid: pid,
			started: 10,
		},
		cgroupManager: &mockCgroupManager{
			pids: []int{1, 2, 3},
			stats: &cgroups.Stats{
				MemoryStats: cgroups.MemoryStats{
					Usage: cgroups.MemoryData{
						Usage: 1024,
					},
				},
			},
			paths: map[string]string{
				"memory": expectedMemoryPath,
			},
		},
		intelRdtManager: &mockIntelRdtManager{
			stats: &intelrdt.Stats{
				L3CacheSchema: "L3:0=f0;1=f",
				MemBwSchema: "MB:0=70;1=20",
			},
			path: expectedIntelRdtPath,
		},
	}
	container.state = &createdState{c: container}
	state, err := container.State()
	if err != nil {
		t.Fatal(err)
	}
	if state.InitProcessPid != pid {
		t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
	}
	if state.InitProcessStartTime != 10 {
		t.Fatalf("expected process start time 10 but received %d", state.InitProcessStartTime)
	}
	paths := state.CgroupPaths
	if paths == nil {
		t.Fatal("cgroup paths should not be nil")
	}
	if memPath := paths["memory"]; memPath != expectedMemoryPath {
		t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
	}
	// The RDT path is only populated on hardware with CAT or MBA enabled.
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		intelRdtPath := state.IntelRdtPath
		if intelRdtPath == "" {
			t.Fatal("intel rdt path should not be empty")
		}
		if intelRdtPath != expectedIntelRdtPath {
			t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath)
		}
	}
	for _, ns := range container.config.Namespaces {
		path := state.NamespacePaths[ns.Type]
		if path == "" {
			t.Fatalf("expected non nil namespace path for %s", ns.Type)
		}
		if ns.Type == configs.NEWNET {
			if path != expectedNetworkPath {
				t.Fatalf("expected path %q but received %q", expectedNetworkPath, path)
			}
		} else {
			file := ""
			// NOTE(review): the NEWNET case below is unreachable (handled by
			// the branch above); kept for symmetry with the type list.
			switch ns.Type {
			case configs.NEWNET:
				file = "net"
			case configs.NEWNS:
				file = "mnt"
			case configs.NEWPID:
				file = "pid"
			case configs.NEWIPC:
				file = "ipc"
			case configs.NEWUSER:
				file = "user"
			case configs.NEWUTS:
				file = "uts"
			}
			expected := fmt.Sprintf("/proc/%d/ns/%s", pid, file)
			if expected != path {
				t.Fatalf("expected path %q but received %q", expected, path)
			}
		}
	}
}
+
+func TestGetContainerStateAfterUpdate(t *testing.T) {
+ var (
+ pid = os.Getpid()
+ )
+ stat, err := system.Stat(pid)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ rootDir, err := ioutil.TempDir("", "TestGetContainerStateAfterUpdate")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ container := &linuxContainer{
+ root: rootDir,
+ id: "myid",
+ config: &configs.Config{
+ Namespaces: []configs.Namespace{
+ {Type: configs.NEWPID},
+ {Type: configs.NEWNS},
+ {Type: configs.NEWNET},
+ {Type: configs.NEWUTS},
+ {Type: configs.NEWIPC},
+ },
+ Cgroups: &configs.Cgroup{
+ Resources: &configs.Resources{
+ Memory: 1024,
+ },
+ },
+ },
+ initProcess: &mockProcess{
+ _pid: pid,
+ started: stat.StartTime,
+ },
+ cgroupManager: &mockCgroupManager{},
+ }
+ container.state = &createdState{c: container}
+ state, err := container.State()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if state.InitProcessPid != pid {
+ t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
+ }
+ if state.InitProcessStartTime != stat.StartTime {
+ t.Fatalf("expected process start time %d but received %d", stat.StartTime, state.InitProcessStartTime)
+ }
+ if state.Config.Cgroups.Resources.Memory != 1024 {
+ t.Fatalf("expected Memory to be 1024 but received %q", state.Config.Cgroups.Memory)
+ }
+
+ // Set initProcessStartTime so we fake to be running
+ container.initProcessStartTime = state.InitProcessStartTime
+ container.state = &runningState{c: container}
+ newConfig := container.Config()
+ newConfig.Cgroups.Resources.Memory = 2048
+ if err := container.Set(newConfig); err != nil {
+ t.Fatal(err)
+ }
+ state, err = container.State()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if state.Config.Cgroups.Resources.Memory != 2048 {
+ t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory)
+ }
+}
--- /dev/null
+package libcontainer
+
// cgMode selects the cgroup-restoring strategy passed through to CRIU.
// The values and the non-Go ALL_CAPS names deliberately mirror CRIU's C
// enum (criu_cg_mode); they are exported API and must not be renamed.
type cgMode uint32

const (
	CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
	CRIU_CG_MODE_FULL // always restore all cgroups and their properties
	CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
	CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
+
// CriuPageServerInfo describes a remote CRIU page server, used when memory
// pages are dumped over the network instead of into local image files.
type CriuPageServerInfo struct {
	Address string // IP address of CRIU page server
	Port int32 // port number of CRIU page server
}

// VethPairName names one veth pair so CRIU can re-attach the container-side
// interface to the right host interface on restore.
type VethPairName struct {
	ContainerInterfaceName string
	HostInterfaceName string
}

// CriuOpts collects the user-tunable knobs for checkpoint and restore; most
// fields map directly onto options in the CRIU RPC request.
type CriuOpts struct {
	ImagesDirectory string // directory for storing image files
	WorkDirectory string // directory to cd and write logs/pidfiles/stats to
	ParentImage string // directory for storing parent image files in pre-dump and dump
	LeaveRunning bool // leave container in running state after checkpoint
	TcpEstablished bool // checkpoint/restore established TCP connections
	ExternalUnixConnections bool // allow external unix connections
	ShellJob bool // allow to dump and restore shell jobs
	FileLocks bool // handle file locks, for safety
	PreDump bool // call criu predump to perform iterative checkpoint
	PageServer CriuPageServerInfo // allow to dump to criu page server
	VethPairs []VethPairName // pass the veth to criu when restore
	ManageCgroupsMode cgMode // dump or restore cgroup mode
	EmptyNs uint32 // don't c/r properties for namespace from this mask
	AutoDedup bool // auto deduplication for incremental dumps
	LazyPages bool // restore memory pages lazily using userfaultfd
	StatusFd string // fd for feedback when lazy server is ready
}
--- /dev/null
# Regenerate the Go protobuf bindings (criurpc.pb.go) from criurpc.proto.
# Requires protoc and the protoc-gen-go plugin on PATH.
gen: criurpc.proto
	protoc --go_out=. criurpc.proto
--- /dev/null
+// Code generated by protoc-gen-go.
+// source: criurpc.proto
+// DO NOT EDIT!
+
+/*
+Package criurpc is a generated protocol buffer package.
+
+It is generated from these files:
+ criurpc.proto
+
+It has these top-level messages:
+ CriuPageServerInfo
+ CriuVethPair
+ ExtMountMap
+ JoinNamespace
+ InheritFd
+ CgroupRoot
+ UnixSk
+ CriuOpts
+ CriuDumpResp
+ CriuRestoreResp
+ CriuNotify
+ CriuFeatures
+ CriuReq
+ CriuResp
+ CriuVersion
+*/
+package criurpc
+
+import proto "github.com/golang/protobuf/proto"
+import fmt "fmt"
+import math "math"
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ = proto.Marshal
+var _ = fmt.Errorf
+var _ = math.Inf
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+// A compilation error at this line likely means your copy of the
+// proto package needs to be updated.
+const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package
+
+// CriuCgMode mirrors CRIU's cgroup-management mode enum from
+// criurpc.proto; it selects how cgroup configuration is dumped/restored.
+// NOTE(review): generated by protoc-gen-go — do not hand-edit logic;
+// regenerate via the Makefile's `gen` target instead.
+type CriuCgMode int32
+
+const (
+ CriuCgMode_IGNORE CriuCgMode = 0
+ CriuCgMode_CG_NONE CriuCgMode = 1
+ CriuCgMode_PROPS CriuCgMode = 2
+ CriuCgMode_SOFT CriuCgMode = 3
+ CriuCgMode_FULL CriuCgMode = 4
+ CriuCgMode_STRICT CriuCgMode = 5
+ CriuCgMode_DEFAULT CriuCgMode = 6
+)
+
+// Number<->name mappings used by the generated String and
+// UnmarshalJSON helpers below.
+var CriuCgMode_name = map[int32]string{
+ 0: "IGNORE",
+ 1: "CG_NONE",
+ 2: "PROPS",
+ 3: "SOFT",
+ 4: "FULL",
+ 5: "STRICT",
+ 6: "DEFAULT",
+}
+var CriuCgMode_value = map[string]int32{
+ "IGNORE": 0,
+ "CG_NONE": 1,
+ "PROPS": 2,
+ "SOFT": 3,
+ "FULL": 4,
+ "STRICT": 5,
+ "DEFAULT": 6,
+}
+
+// Enum returns a freshly allocated pointer holding x; proto2 optional
+// enum fields are represented as pointers.
+func (x CriuCgMode) Enum() *CriuCgMode {
+ p := new(CriuCgMode)
+ *p = x
+ return p
+}
+func (x CriuCgMode) String() string {
+ return proto.EnumName(CriuCgMode_name, int32(x))
+}
+func (x *CriuCgMode) UnmarshalJSON(data []byte) error {
+ value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode")
+ if err != nil {
+ return err
+ }
+ *x = CriuCgMode(value)
+ return nil
+}
+func (CriuCgMode) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
+
+// CriuReqType enumerates the RPC request kinds understood by the CRIU
+// service (dump, restore, feature check, version query, ...).
+// NOTE(review): generated by protoc-gen-go — do not hand-edit logic.
+type CriuReqType int32
+
+const (
+ CriuReqType_EMPTY CriuReqType = 0
+ CriuReqType_DUMP CriuReqType = 1
+ CriuReqType_RESTORE CriuReqType = 2
+ CriuReqType_CHECK CriuReqType = 3
+ CriuReqType_PRE_DUMP CriuReqType = 4
+ CriuReqType_PAGE_SERVER CriuReqType = 5
+ CriuReqType_NOTIFY CriuReqType = 6
+ CriuReqType_CPUINFO_DUMP CriuReqType = 7
+ CriuReqType_CPUINFO_CHECK CriuReqType = 8
+ CriuReqType_FEATURE_CHECK CriuReqType = 9
+ CriuReqType_VERSION CriuReqType = 10
+)
+
+// Number<->name mappings used by the generated String and
+// UnmarshalJSON helpers below.
+var CriuReqType_name = map[int32]string{
+ 0: "EMPTY",
+ 1: "DUMP",
+ 2: "RESTORE",
+ 3: "CHECK",
+ 4: "PRE_DUMP",
+ 5: "PAGE_SERVER",
+ 6: "NOTIFY",
+ 7: "CPUINFO_DUMP",
+ 8: "CPUINFO_CHECK",
+ 9: "FEATURE_CHECK",
+ 10: "VERSION",
+}
+var CriuReqType_value = map[string]int32{
+ "EMPTY": 0,
+ "DUMP": 1,
+ "RESTORE": 2,
+ "CHECK": 3,
+ "PRE_DUMP": 4,
+ "PAGE_SERVER": 5,
+ "NOTIFY": 6,
+ "CPUINFO_DUMP": 7,
+ "CPUINFO_CHECK": 8,
+ "FEATURE_CHECK": 9,
+ "VERSION": 10,
+}
+
+// Enum returns a freshly allocated pointer holding x; proto2 optional
+// enum fields are represented as pointers.
+func (x CriuReqType) Enum() *CriuReqType {
+ p := new(CriuReqType)
+ *p = x
+ return p
+}
+func (x CriuReqType) String() string {
+ return proto.EnumName(CriuReqType_name, int32(x))
+}
+func (x *CriuReqType) UnmarshalJSON(data []byte) error {
+ value, err := proto.UnmarshalJSONEnum(CriuReqType_value, data, "CriuReqType")
+ if err != nil {
+ return err
+ }
+ *x = CriuReqType(value)
+ return nil
+}
+func (CriuReqType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{1} }
+
+// CriuPageServerInfo is the generated binding for the
+// criu_page_server_info message (see the RegisterType call in init).
+// Generated code — do not hand-edit logic.
+type CriuPageServerInfo struct {
+ Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"`
+ Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"`
+ Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"`
+ Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+// Reset, String, ProtoMessage and Descriptor implement proto.Message.
+func (m *CriuPageServerInfo) Reset() { *m = CriuPageServerInfo{} }
+func (m *CriuPageServerInfo) String() string { return proto.CompactTextString(m) }
+func (*CriuPageServerInfo) ProtoMessage() {}
+func (*CriuPageServerInfo) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
+
+// The getters below are nil-safe: they return the field's zero value
+// when the receiver or the optional field is nil.
+func (m *CriuPageServerInfo) GetAddress() string {
+ if m != nil && m.Address != nil {
+ return *m.Address
+ }
+ return ""
+}
+
+func (m *CriuPageServerInfo) GetPort() int32 {
+ if m != nil && m.Port != nil {
+ return *m.Port
+ }
+ return 0
+}
+
+func (m *CriuPageServerInfo) GetPid() int32 {
+ if m != nil && m.Pid != nil {
+ return *m.Pid
+ }
+ return 0
+}
+
+func (m *CriuPageServerInfo) GetFd() int32 {
+ if m != nil && m.Fd != nil {
+ return *m.Fd
+ }
+ return 0
+}
+
+// CriuVethPair is the generated binding for the criu_veth_pair message:
+// a veth interface pair (in/out names). Generated code — do not hand-edit.
+type CriuVethPair struct {
+ IfIn *string `protobuf:"bytes,1,req,name=if_in,json=ifIn" json:"if_in,omitempty"`
+ IfOut *string `protobuf:"bytes,2,req,name=if_out,json=ifOut" json:"if_out,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuVethPair) Reset() { *m = CriuVethPair{} }
+func (m *CriuVethPair) String() string { return proto.CompactTextString(m) }
+func (*CriuVethPair) ProtoMessage() {}
+func (*CriuVethPair) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} }
+
+// Nil-safe getters; return "" when the receiver or field is nil.
+func (m *CriuVethPair) GetIfIn() string {
+ if m != nil && m.IfIn != nil {
+ return *m.IfIn
+ }
+ return ""
+}
+
+func (m *CriuVethPair) GetIfOut() string {
+ if m != nil && m.IfOut != nil {
+ return *m.IfOut
+ }
+ return ""
+}
+
+// ExtMountMap is the generated binding for the ext_mount_map message:
+// a key/value pair describing an external mount. Generated code.
+type ExtMountMap struct {
+ Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"`
+ Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *ExtMountMap) Reset() { *m = ExtMountMap{} }
+func (m *ExtMountMap) String() string { return proto.CompactTextString(m) }
+func (*ExtMountMap) ProtoMessage() {}
+func (*ExtMountMap) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{2} }
+
+// Nil-safe getters; return "" when the receiver or field is nil.
+func (m *ExtMountMap) GetKey() string {
+ if m != nil && m.Key != nil {
+ return *m.Key
+ }
+ return ""
+}
+
+func (m *ExtMountMap) GetVal() string {
+ if m != nil && m.Val != nil {
+ return *m.Val
+ }
+ return ""
+}
+
+// JoinNamespace is the generated binding for the join_namespace message:
+// a namespace to join on restore, identified by type and nsfs file.
+// Generated code — do not hand-edit logic.
+type JoinNamespace struct {
+ Ns *string `protobuf:"bytes,1,req,name=ns" json:"ns,omitempty"`
+ NsFile *string `protobuf:"bytes,2,req,name=ns_file,json=nsFile" json:"ns_file,omitempty"`
+ ExtraOpt *string `protobuf:"bytes,3,opt,name=extra_opt,json=extraOpt" json:"extra_opt,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *JoinNamespace) Reset() { *m = JoinNamespace{} }
+func (m *JoinNamespace) String() string { return proto.CompactTextString(m) }
+func (*JoinNamespace) ProtoMessage() {}
+func (*JoinNamespace) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{3} }
+
+// Nil-safe getters; return "" when the receiver or field is nil.
+func (m *JoinNamespace) GetNs() string {
+ if m != nil && m.Ns != nil {
+ return *m.Ns
+ }
+ return ""
+}
+
+func (m *JoinNamespace) GetNsFile() string {
+ if m != nil && m.NsFile != nil {
+ return *m.NsFile
+ }
+ return ""
+}
+
+func (m *JoinNamespace) GetExtraOpt() string {
+ if m != nil && m.ExtraOpt != nil {
+ return *m.ExtraOpt
+ }
+ return ""
+}
+
+// InheritFd is the generated binding for the inherit_fd message: maps a
+// key to a file descriptor CRIU should inherit. Generated code.
+type InheritFd struct {
+ Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"`
+ Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *InheritFd) Reset() { *m = InheritFd{} }
+func (m *InheritFd) String() string { return proto.CompactTextString(m) }
+func (*InheritFd) ProtoMessage() {}
+func (*InheritFd) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{4} }
+
+// Nil-safe getters; return the zero value when receiver/field is nil.
+func (m *InheritFd) GetKey() string {
+ if m != nil && m.Key != nil {
+ return *m.Key
+ }
+ return ""
+}
+
+func (m *InheritFd) GetFd() int32 {
+ if m != nil && m.Fd != nil {
+ return *m.Fd
+ }
+ return 0
+}
+
+// CgroupRoot is the generated binding for the cgroup_root message:
+// an optional controller name and its root path. Generated code.
+type CgroupRoot struct {
+ Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"`
+ Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CgroupRoot) Reset() { *m = CgroupRoot{} }
+func (m *CgroupRoot) String() string { return proto.CompactTextString(m) }
+func (*CgroupRoot) ProtoMessage() {}
+func (*CgroupRoot) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{5} }
+
+// Nil-safe getters; return "" when the receiver or field is nil.
+func (m *CgroupRoot) GetCtrl() string {
+ if m != nil && m.Ctrl != nil {
+ return *m.Ctrl
+ }
+ return ""
+}
+
+func (m *CgroupRoot) GetPath() string {
+ if m != nil && m.Path != nil {
+ return *m.Path
+ }
+ return ""
+}
+
+// UnixSk is the generated binding for the unix_sk message: a unix
+// socket identified by inode number. Generated code.
+type UnixSk struct {
+ Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *UnixSk) Reset() { *m = UnixSk{} }
+func (m *UnixSk) String() string { return proto.CompactTextString(m) }
+func (*UnixSk) ProtoMessage() {}
+func (*UnixSk) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{6} }
+
+// GetInode is nil-safe; returns 0 when the receiver or field is nil.
+func (m *UnixSk) GetInode() uint32 {
+ if m != nil && m.Inode != nil {
+ return *m.Inode
+ }
+ return 0
+}
+
+// CriuOpts is the generated binding for CRIU's criu_opts message — the
+// full option set sent with a DUMP/RESTORE/PRE_DUMP request. All scalar
+// fields are pointers because this is a proto2 schema with optional
+// fields; nil means "not set". Generated code — do not hand-edit logic;
+// regenerate from criurpc.proto via the `gen` make target.
+type CriuOpts struct {
+ ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd,json=imagesDirFd" json:"images_dir_fd,omitempty"`
+ Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"`
+ LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running,json=leaveRunning" json:"leave_running,omitempty"`
+ ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk,json=extUnixSk" json:"ext_unix_sk,omitempty"`
+ TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established,json=tcpEstablished" json:"tcp_established,omitempty"`
+ EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices,json=evasiveDevices" json:"evasive_devices,omitempty"`
+ ShellJob *bool `protobuf:"varint,7,opt,name=shell_job,json=shellJob" json:"shell_job,omitempty"`
+ FileLocks *bool `protobuf:"varint,8,opt,name=file_locks,json=fileLocks" json:"file_locks,omitempty"`
+ LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,json=logLevel,def=2" json:"log_level,omitempty"`
+ LogFile *string `protobuf:"bytes,10,opt,name=log_file,json=logFile" json:"log_file,omitempty"`
+ Ps *CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"`
+ NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts,json=notifyScripts" json:"notify_scripts,omitempty"`
+ Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"`
+ ParentImg *string `protobuf:"bytes,14,opt,name=parent_img,json=parentImg" json:"parent_img,omitempty"`
+ TrackMem *bool `protobuf:"varint,15,opt,name=track_mem,json=trackMem" json:"track_mem,omitempty"`
+ AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup,json=autoDedup" json:"auto_dedup,omitempty"`
+ WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd,json=workDirFd" json:"work_dir_fd,omitempty"`
+ LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap,json=linkRemap" json:"link_remap,omitempty"`
+ Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"`
+ CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,json=cpuCap,def=4294967295" json:"cpu_cap,omitempty"`
+ ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap,json=forceIrmap" json:"force_irmap,omitempty"`
+ ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd,json=execCmd" json:"exec_cmd,omitempty"`
+ ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt,json=extMnt" json:"ext_mnt,omitempty"`
+ ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups,json=manageCgroups" json:"manage_cgroups,omitempty"`
+ CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root,json=cgRoot" json:"cg_root,omitempty"`
+ RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling,json=rstSibling" json:"rst_sibling,omitempty"`
+ InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd,json=inheritFd" json:"inherit_fd,omitempty"`
+ AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt,json=autoExtMnt" json:"auto_ext_mnt,omitempty"`
+ ExtSharing *bool `protobuf:"varint,29,opt,name=ext_sharing,json=extSharing" json:"ext_sharing,omitempty"`
+ ExtMasters *bool `protobuf:"varint,30,opt,name=ext_masters,json=extMasters" json:"ext_masters,omitempty"`
+ SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt,json=skipMnt" json:"skip_mnt,omitempty"`
+ EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs,json=enableFs" json:"enable_fs,omitempty"`
+ UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino,json=unixSkIno" json:"unix_sk_ino,omitempty"`
+ ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,json=manageCgroupsMode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"`
+ GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,json=ghostLimit,def=1048576" json:"ghost_limit,omitempty"`
+ IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths,json=irmapScanPaths" json:"irmap_scan_paths,omitempty"`
+ External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"`
+ EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns,json=emptyNs" json:"empty_ns,omitempty"`
+ JoinNs []*JoinNamespace `protobuf:"bytes,39,rep,name=join_ns,json=joinNs" json:"join_ns,omitempty"`
+ CgroupProps *string `protobuf:"bytes,41,opt,name=cgroup_props,json=cgroupProps" json:"cgroup_props,omitempty"`
+ CgroupPropsFile *string `protobuf:"bytes,42,opt,name=cgroup_props_file,json=cgroupPropsFile" json:"cgroup_props_file,omitempty"`
+ CgroupDumpController []string `protobuf:"bytes,43,rep,name=cgroup_dump_controller,json=cgroupDumpController" json:"cgroup_dump_controller,omitempty"`
+ FreezeCgroup *string `protobuf:"bytes,44,opt,name=freeze_cgroup,json=freezeCgroup" json:"freeze_cgroup,omitempty"`
+ Timeout *uint32 `protobuf:"varint,45,opt,name=timeout" json:"timeout,omitempty"`
+ TcpSkipInFlight *bool `protobuf:"varint,46,opt,name=tcp_skip_in_flight,json=tcpSkipInFlight" json:"tcp_skip_in_flight,omitempty"`
+ WeakSysctls *bool `protobuf:"varint,47,opt,name=weak_sysctls,json=weakSysctls" json:"weak_sysctls,omitempty"`
+ LazyPages *bool `protobuf:"varint,48,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"`
+ StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd,json=statusFd" json:"status_fd,omitempty"`
+ OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master,json=orphanPtsMaster" json:"orphan_pts_master,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+// Reset, String, ProtoMessage and Descriptor implement proto.Message.
+func (m *CriuOpts) Reset() { *m = CriuOpts{} }
+func (m *CriuOpts) String() string { return proto.CompactTextString(m) }
+func (*CriuOpts) ProtoMessage() {}
+func (*CriuOpts) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{7} }
+
+// Proto-declared defaults (from the `def=` tags above), returned by the
+// corresponding getters when the field is unset.
+const Default_CriuOpts_LogLevel int32 = 2
+const Default_CriuOpts_CpuCap uint32 = 4294967295
+const Default_CriuOpts_GhostLimit uint32 = 1048576
+
+// Generated nil-safe accessors for CriuOpts. Each returns the field's
+// zero value (or its declared proto default, e.g. GetLogLevel/GetCpuCap/
+// GetGhostLimit) when the receiver or the optional field is nil.
+// Generated code — do not hand-edit.
+func (m *CriuOpts) GetImagesDirFd() int32 {
+ if m != nil && m.ImagesDirFd != nil {
+ return *m.ImagesDirFd
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetPid() int32 {
+ if m != nil && m.Pid != nil {
+ return *m.Pid
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetLeaveRunning() bool {
+ if m != nil && m.LeaveRunning != nil {
+ return *m.LeaveRunning
+ }
+ return false
+}
+
+func (m *CriuOpts) GetExtUnixSk() bool {
+ if m != nil && m.ExtUnixSk != nil {
+ return *m.ExtUnixSk
+ }
+ return false
+}
+
+func (m *CriuOpts) GetTcpEstablished() bool {
+ if m != nil && m.TcpEstablished != nil {
+ return *m.TcpEstablished
+ }
+ return false
+}
+
+func (m *CriuOpts) GetEvasiveDevices() bool {
+ if m != nil && m.EvasiveDevices != nil {
+ return *m.EvasiveDevices
+ }
+ return false
+}
+
+func (m *CriuOpts) GetShellJob() bool {
+ if m != nil && m.ShellJob != nil {
+ return *m.ShellJob
+ }
+ return false
+}
+
+func (m *CriuOpts) GetFileLocks() bool {
+ if m != nil && m.FileLocks != nil {
+ return *m.FileLocks
+ }
+ return false
+}
+
+func (m *CriuOpts) GetLogLevel() int32 {
+ if m != nil && m.LogLevel != nil {
+ return *m.LogLevel
+ }
+ return Default_CriuOpts_LogLevel
+}
+
+func (m *CriuOpts) GetLogFile() string {
+ if m != nil && m.LogFile != nil {
+ return *m.LogFile
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetPs() *CriuPageServerInfo {
+ if m != nil {
+ return m.Ps
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetNotifyScripts() bool {
+ if m != nil && m.NotifyScripts != nil {
+ return *m.NotifyScripts
+ }
+ return false
+}
+
+func (m *CriuOpts) GetRoot() string {
+ if m != nil && m.Root != nil {
+ return *m.Root
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetParentImg() string {
+ if m != nil && m.ParentImg != nil {
+ return *m.ParentImg
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetTrackMem() bool {
+ if m != nil && m.TrackMem != nil {
+ return *m.TrackMem
+ }
+ return false
+}
+
+func (m *CriuOpts) GetAutoDedup() bool {
+ if m != nil && m.AutoDedup != nil {
+ return *m.AutoDedup
+ }
+ return false
+}
+
+func (m *CriuOpts) GetWorkDirFd() int32 {
+ if m != nil && m.WorkDirFd != nil {
+ return *m.WorkDirFd
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetLinkRemap() bool {
+ if m != nil && m.LinkRemap != nil {
+ return *m.LinkRemap
+ }
+ return false
+}
+
+func (m *CriuOpts) GetVeths() []*CriuVethPair {
+ if m != nil {
+ return m.Veths
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetCpuCap() uint32 {
+ if m != nil && m.CpuCap != nil {
+ return *m.CpuCap
+ }
+ return Default_CriuOpts_CpuCap
+}
+
+func (m *CriuOpts) GetForceIrmap() bool {
+ if m != nil && m.ForceIrmap != nil {
+ return *m.ForceIrmap
+ }
+ return false
+}
+
+func (m *CriuOpts) GetExecCmd() []string {
+ if m != nil {
+ return m.ExecCmd
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetExtMnt() []*ExtMountMap {
+ if m != nil {
+ return m.ExtMnt
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetManageCgroups() bool {
+ if m != nil && m.ManageCgroups != nil {
+ return *m.ManageCgroups
+ }
+ return false
+}
+
+func (m *CriuOpts) GetCgRoot() []*CgroupRoot {
+ if m != nil {
+ return m.CgRoot
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetRstSibling() bool {
+ if m != nil && m.RstSibling != nil {
+ return *m.RstSibling
+ }
+ return false
+}
+
+func (m *CriuOpts) GetInheritFd() []*InheritFd {
+ if m != nil {
+ return m.InheritFd
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetAutoExtMnt() bool {
+ if m != nil && m.AutoExtMnt != nil {
+ return *m.AutoExtMnt
+ }
+ return false
+}
+
+func (m *CriuOpts) GetExtSharing() bool {
+ if m != nil && m.ExtSharing != nil {
+ return *m.ExtSharing
+ }
+ return false
+}
+
+func (m *CriuOpts) GetExtMasters() bool {
+ if m != nil && m.ExtMasters != nil {
+ return *m.ExtMasters
+ }
+ return false
+}
+
+func (m *CriuOpts) GetSkipMnt() []string {
+ if m != nil {
+ return m.SkipMnt
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetEnableFs() []string {
+ if m != nil {
+ return m.EnableFs
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
+ if m != nil {
+ return m.UnixSkIno
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode {
+ if m != nil && m.ManageCgroupsMode != nil {
+ return *m.ManageCgroupsMode
+ }
+ return CriuCgMode_IGNORE
+}
+
+func (m *CriuOpts) GetGhostLimit() uint32 {
+ if m != nil && m.GhostLimit != nil {
+ return *m.GhostLimit
+ }
+ return Default_CriuOpts_GhostLimit
+}
+
+func (m *CriuOpts) GetIrmapScanPaths() []string {
+ if m != nil {
+ return m.IrmapScanPaths
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetExternal() []string {
+ if m != nil {
+ return m.External
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetEmptyNs() uint32 {
+ if m != nil && m.EmptyNs != nil {
+ return *m.EmptyNs
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetJoinNs() []*JoinNamespace {
+ if m != nil {
+ return m.JoinNs
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetCgroupProps() string {
+ if m != nil && m.CgroupProps != nil {
+ return *m.CgroupProps
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetCgroupPropsFile() string {
+ if m != nil && m.CgroupPropsFile != nil {
+ return *m.CgroupPropsFile
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetCgroupDumpController() []string {
+ if m != nil {
+ return m.CgroupDumpController
+ }
+ return nil
+}
+
+func (m *CriuOpts) GetFreezeCgroup() string {
+ if m != nil && m.FreezeCgroup != nil {
+ return *m.FreezeCgroup
+ }
+ return ""
+}
+
+func (m *CriuOpts) GetTimeout() uint32 {
+ if m != nil && m.Timeout != nil {
+ return *m.Timeout
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetTcpSkipInFlight() bool {
+ if m != nil && m.TcpSkipInFlight != nil {
+ return *m.TcpSkipInFlight
+ }
+ return false
+}
+
+func (m *CriuOpts) GetWeakSysctls() bool {
+ if m != nil && m.WeakSysctls != nil {
+ return *m.WeakSysctls
+ }
+ return false
+}
+
+func (m *CriuOpts) GetLazyPages() bool {
+ if m != nil && m.LazyPages != nil {
+ return *m.LazyPages
+ }
+ return false
+}
+
+func (m *CriuOpts) GetStatusFd() int32 {
+ if m != nil && m.StatusFd != nil {
+ return *m.StatusFd
+ }
+ return 0
+}
+
+func (m *CriuOpts) GetOrphanPtsMaster() bool {
+ if m != nil && m.OrphanPtsMaster != nil {
+ return *m.OrphanPtsMaster
+ }
+ return false
+}
+
+// CriuDumpResp is the generated binding for the criu_dump_resp message:
+// the reply to a DUMP request. Generated code.
+type CriuDumpResp struct {
+ Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuDumpResp) Reset() { *m = CriuDumpResp{} }
+func (m *CriuDumpResp) String() string { return proto.CompactTextString(m) }
+func (*CriuDumpResp) ProtoMessage() {}
+func (*CriuDumpResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{8} }
+
+// GetRestored is nil-safe; returns false when receiver/field is nil.
+func (m *CriuDumpResp) GetRestored() bool {
+ if m != nil && m.Restored != nil {
+ return *m.Restored
+ }
+ return false
+}
+
+// CriuRestoreResp is the generated binding for the criu_restore_resp
+// message: the reply to a RESTORE request, carrying the restored pid.
+type CriuRestoreResp struct {
+ Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuRestoreResp) Reset() { *m = CriuRestoreResp{} }
+func (m *CriuRestoreResp) String() string { return proto.CompactTextString(m) }
+func (*CriuRestoreResp) ProtoMessage() {}
+func (*CriuRestoreResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{9} }
+
+// GetPid is nil-safe; returns 0 when receiver/field is nil.
+func (m *CriuRestoreResp) GetPid() int32 {
+ if m != nil && m.Pid != nil {
+ return *m.Pid
+ }
+ return 0
+}
+
+// CriuNotify is the generated binding for the criu_notify message:
+// a notification event (script name plus optional pid). Generated code.
+type CriuNotify struct {
+ Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"`
+ Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuNotify) Reset() { *m = CriuNotify{} }
+func (m *CriuNotify) String() string { return proto.CompactTextString(m) }
+func (*CriuNotify) ProtoMessage() {}
+func (*CriuNotify) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{10} }
+
+// Nil-safe getters; return the zero value when receiver/field is nil.
+func (m *CriuNotify) GetScript() string {
+ if m != nil && m.Script != nil {
+ return *m.Script
+ }
+ return ""
+}
+
+func (m *CriuNotify) GetPid() int32 {
+ if m != nil && m.Pid != nil {
+ return *m.Pid
+ }
+ return 0
+}
+
+//
+// List of features which can queried via
+// CRIU_REQ_TYPE__FEATURE_CHECK
+type CriuFeatures struct {
+ MemTrack *bool `protobuf:"varint,1,opt,name=mem_track,json=memTrack" json:"mem_track,omitempty"`
+ LazyPages *bool `protobuf:"varint,2,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuFeatures) Reset() { *m = CriuFeatures{} }
+func (m *CriuFeatures) String() string { return proto.CompactTextString(m) }
+func (*CriuFeatures) ProtoMessage() {}
+func (*CriuFeatures) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{11} }
+
+// Nil-safe getters; return false when receiver/field is nil.
+func (m *CriuFeatures) GetMemTrack() bool {
+ if m != nil && m.MemTrack != nil {
+ return *m.MemTrack
+ }
+ return false
+}
+
+func (m *CriuFeatures) GetLazyPages() bool {
+ if m != nil && m.LazyPages != nil {
+ return *m.LazyPages
+ }
+ return false
+}
+
+// CriuReq is the generated binding for the criu_req message — the
+// top-level envelope sent to the CRIU service: the request type plus
+// its options and connection-handling flags. Generated code.
+type CriuReq struct {
+ Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
+ Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
+ NotifySuccess *bool `protobuf:"varint,3,opt,name=notify_success,json=notifySuccess" json:"notify_success,omitempty"`
+ //
+ // When set service won't close the connection but
+ // will wait for more req-s to appear. Works not
+ // for all request types.
+ KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open,json=keepOpen" json:"keep_open,omitempty"`
+ //
+ // 'features' can be used to query which features
+ // are supported by the installed criu/kernel
+ // via RPC.
+ Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuReq) Reset() { *m = CriuReq{} }
+func (m *CriuReq) String() string { return proto.CompactTextString(m) }
+func (*CriuReq) ProtoMessage() {}
+func (*CriuReq) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{12} }
+
+// Nil-safe getters; scalar getters return the zero value (or
+// CriuReqType_EMPTY for Type), message getters return nil.
+func (m *CriuReq) GetType() CriuReqType {
+ if m != nil && m.Type != nil {
+ return *m.Type
+ }
+ return CriuReqType_EMPTY
+}
+
+func (m *CriuReq) GetOpts() *CriuOpts {
+ if m != nil {
+ return m.Opts
+ }
+ return nil
+}
+
+func (m *CriuReq) GetNotifySuccess() bool {
+ if m != nil && m.NotifySuccess != nil {
+ return *m.NotifySuccess
+ }
+ return false
+}
+
+func (m *CriuReq) GetKeepOpen() bool {
+ if m != nil && m.KeepOpen != nil {
+ return *m.KeepOpen
+ }
+ return false
+}
+
+func (m *CriuReq) GetFeatures() *CriuFeatures {
+ if m != nil {
+ return m.Features
+ }
+ return nil
+}
+
+// CriuResp is the generated binding for the criu_resp message — the
+// top-level reply from the CRIU service; exactly which sub-message is
+// populated depends on the request Type. Generated code.
+type CriuResp struct {
+ Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
+ Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
+ Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"`
+ Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"`
+ Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
+ Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
+ CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno,json=crErrno" json:"cr_errno,omitempty"`
+ Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"`
+ CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg,json=crErrmsg" json:"cr_errmsg,omitempty"`
+ Version *CriuVersion `protobuf:"bytes,10,opt,name=version" json:"version,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuResp) Reset() { *m = CriuResp{} }
+func (m *CriuResp) String() string { return proto.CompactTextString(m) }
+func (*CriuResp) ProtoMessage() {}
+func (*CriuResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{13} }
+
+// Nil-safe getters; scalar getters return the zero value (or
+// CriuReqType_EMPTY for Type), message getters return nil.
+func (m *CriuResp) GetType() CriuReqType {
+ if m != nil && m.Type != nil {
+ return *m.Type
+ }
+ return CriuReqType_EMPTY
+}
+
+func (m *CriuResp) GetSuccess() bool {
+ if m != nil && m.Success != nil {
+ return *m.Success
+ }
+ return false
+}
+
+func (m *CriuResp) GetDump() *CriuDumpResp {
+ if m != nil {
+ return m.Dump
+ }
+ return nil
+}
+
+func (m *CriuResp) GetRestore() *CriuRestoreResp {
+ if m != nil {
+ return m.Restore
+ }
+ return nil
+}
+
+func (m *CriuResp) GetNotify() *CriuNotify {
+ if m != nil {
+ return m.Notify
+ }
+ return nil
+}
+
+func (m *CriuResp) GetPs() *CriuPageServerInfo {
+ if m != nil {
+ return m.Ps
+ }
+ return nil
+}
+
+func (m *CriuResp) GetCrErrno() int32 {
+ if m != nil && m.CrErrno != nil {
+ return *m.CrErrno
+ }
+ return 0
+}
+
+func (m *CriuResp) GetFeatures() *CriuFeatures {
+ if m != nil {
+ return m.Features
+ }
+ return nil
+}
+
+func (m *CriuResp) GetCrErrmsg() string {
+ if m != nil && m.CrErrmsg != nil {
+ return *m.CrErrmsg
+ }
+ return ""
+}
+
+func (m *CriuResp) GetVersion() *CriuVersion {
+ if m != nil {
+ return m.Version
+ }
+ return nil
+}
+
+// Answer for criu_req_type.VERSION requests
+// (generated binding for the criu_version message). Generated code.
+type CriuVersion struct {
+ Major *int32 `protobuf:"varint,1,req,name=major" json:"major,omitempty"`
+ Minor *int32 `protobuf:"varint,2,req,name=minor" json:"minor,omitempty"`
+ Gitid *string `protobuf:"bytes,3,opt,name=gitid" json:"gitid,omitempty"`
+ Sublevel *int32 `protobuf:"varint,4,opt,name=sublevel" json:"sublevel,omitempty"`
+ Extra *int32 `protobuf:"varint,5,opt,name=extra" json:"extra,omitempty"`
+ Name *string `protobuf:"bytes,6,opt,name=name" json:"name,omitempty"`
+ XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *CriuVersion) Reset() { *m = CriuVersion{} }
+func (m *CriuVersion) String() string { return proto.CompactTextString(m) }
+func (*CriuVersion) ProtoMessage() {}
+func (*CriuVersion) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{14} }
+
+// Nil-safe getters; return the zero value when receiver/field is nil.
+func (m *CriuVersion) GetMajor() int32 {
+ if m != nil && m.Major != nil {
+ return *m.Major
+ }
+ return 0
+}
+
+func (m *CriuVersion) GetMinor() int32 {
+ if m != nil && m.Minor != nil {
+ return *m.Minor
+ }
+ return 0
+}
+
+func (m *CriuVersion) GetGitid() string {
+ if m != nil && m.Gitid != nil {
+ return *m.Gitid
+ }
+ return ""
+}
+
+func (m *CriuVersion) GetSublevel() int32 {
+ if m != nil && m.Sublevel != nil {
+ return *m.Sublevel
+ }
+ return 0
+}
+
+func (m *CriuVersion) GetExtra() int32 {
+ if m != nil && m.Extra != nil {
+ return *m.Extra
+ }
+ return 0
+}
+
+func (m *CriuVersion) GetName() string {
+ if m != nil && m.Name != nil {
+ return *m.Name
+ }
+ return ""
+}
+
+// init registers every generated message type and enum with the proto
+// registry under its original .proto name so (un)marshaling and
+// text formatting can resolve them.
+func init() {
+ proto.RegisterType((*CriuPageServerInfo)(nil), "criu_page_server_info")
+ proto.RegisterType((*CriuVethPair)(nil), "criu_veth_pair")
+ proto.RegisterType((*ExtMountMap)(nil), "ext_mount_map")
+ proto.RegisterType((*JoinNamespace)(nil), "join_namespace")
+ proto.RegisterType((*InheritFd)(nil), "inherit_fd")
+ proto.RegisterType((*CgroupRoot)(nil), "cgroup_root")
+ proto.RegisterType((*UnixSk)(nil), "unix_sk")
+ proto.RegisterType((*CriuOpts)(nil), "criu_opts")
+ proto.RegisterType((*CriuDumpResp)(nil), "criu_dump_resp")
+ proto.RegisterType((*CriuRestoreResp)(nil), "criu_restore_resp")
+ proto.RegisterType((*CriuNotify)(nil), "criu_notify")
+ proto.RegisterType((*CriuFeatures)(nil), "criu_features")
+ proto.RegisterType((*CriuReq)(nil), "criu_req")
+ proto.RegisterType((*CriuResp)(nil), "criu_resp")
+ proto.RegisterType((*CriuVersion)(nil), "criu_version")
+ proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value)
+ proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
+}
+
+// Register the compressed file descriptor so Descriptor()/EnumDescriptor()
+// lookups against fileDescriptor0 resolve.
+func init() { proto.RegisterFile("criurpc.proto", fileDescriptor0) }
+
+var fileDescriptor0 = []byte{
+ // 1781 bytes of a gzipped FileDescriptorProto
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xdd, 0x72, 0x5b, 0xb7,
+ 0x11, 0x0e, 0x29, 0xfe, 0x1c, 0x82, 0x3f, 0xa6, 0x10, 0xdb, 0x81, 0x93, 0xda, 0x62, 0xe8, 0x28,
+ 0x51, 0x15, 0x97, 0x4d, 0x58, 0x3b, 0xae, 0x33, 0xed, 0x85, 0x47, 0x22, 0x5d, 0x36, 0x92, 0xc8,
+ 0x01, 0x25, 0xcf, 0xe4, 0x0a, 0x73, 0x74, 0x0e, 0x48, 0xc1, 0x3c, 0x7f, 0x05, 0x40, 0x45, 0xf2,
+ 0x83, 0xf4, 0x29, 0xfa, 0x0c, 0x7d, 0x84, 0xbe, 0x4e, 0x6f, 0x3b, 0xbb, 0x00, 0x65, 0x29, 0xc9,
+ 0xb4, 0xbd, 0xc3, 0x7e, 0x58, 0x00, 0xbb, 0xfb, 0xed, 0x0f, 0x48, 0x3b, 0xd2, 0x6a, 0xad, 0x8b,
+ 0x68, 0x50, 0xe8, 0xdc, 0xe6, 0xfd, 0x25, 0x79, 0x00, 0x80, 0x28, 0xc2, 0xa5, 0x14, 0x46, 0xea,
+ 0x4b, 0xa9, 0x85, 0xca, 0x16, 0x39, 0x65, 0xa4, 0x1e, 0xc6, 0xb1, 0x96, 0xc6, 0xb0, 0x52, 0xaf,
+ 0xb4, 0xd7, 0xe0, 0x1b, 0x91, 0x52, 0x52, 0x29, 0x72, 0x6d, 0x59, 0xb9, 0x57, 0xda, 0xab, 0x72,
+ 0x5c, 0xd3, 0x2e, 0xd9, 0x2a, 0x54, 0xcc, 0xb6, 0x10, 0x82, 0x25, 0xed, 0x90, 0xf2, 0x22, 0x66,
+ 0x15, 0x04, 0xca, 0x8b, 0xb8, 0xff, 0x27, 0xd2, 0xc1, 0x87, 0x2e, 0xa5, 0xbd, 0x10, 0x45, 0xa8,
+ 0x34, 0xfd, 0x98, 0x54, 0xd5, 0x42, 0xa8, 0x8c, 0x95, 0x7a, 0xe5, 0xbd, 0x06, 0xaf, 0xa8, 0xc5,
+ 0x24, 0xa3, 0x0f, 0x48, 0x4d, 0x2d, 0x44, 0xbe, 0x86, 0xeb, 0x01, 0xad, 0xaa, 0xc5, 0x74, 0x6d,
+ 0xfb, 0x7f, 0x20, 0x6d, 0x79, 0x65, 0x45, 0x9a, 0xaf, 0x33, 0x2b, 0xd2, 0xb0, 0x80, 0x07, 0x57,
+ 0xf2, 0xda, 0x1f, 0x85, 0x25, 0x20, 0x97, 0x61, 0xe2, 0x8f, 0xc1, 0xb2, 0xff, 0x96, 0x74, 0xde,
+ 0xe5, 0x2a, 0x13, 0x59, 0x98, 0x4a, 0x53, 0x84, 0x91, 0x04, 0xa3, 0x32, 0xe3, 0x0f, 0x95, 0x33,
+ 0x43, 0x3f, 0x21, 0xf5, 0xcc, 0x88, 0x85, 0x4a, 0xa4, 0x3f, 0x57, 0xcb, 0xcc, 0x58, 0x25, 0x92,
+ 0x7e, 0x46, 0x1a, 0xf2, 0xca, 0xea, 0x50, 0xe4, 0x85, 0x45, 0xaf, 0x1a, 0x3c, 0x40, 0x60, 0x5a,
+ 0xd8, 0xfe, 0x80, 0x10, 0x95, 0x5d, 0x48, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0x62, 0x89, 0x73, 0x1d,
+ 0x2e, 0x74, 0xae, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0x88, 0x5f,
+ 0x64, 0x75, 0xe2, 0xc3, 0x8a, 0x6b, 0x8c, 0x69, 0x68, 0x2f, 0xbc, 0x15, 0xb8, 0xee, 0xef, 0x90,
+ 0xfa, 0x3a, 0x53, 0x57, 0xc2, 0xac, 0xe8, 0x7d, 0x52, 0x55, 0x59, 0x1e, 0x4b, 0x7c, 0xa5, 0xcd,
+ 0x9d, 0xd0, 0xff, 0x57, 0x9b, 0x34, 0x30, 0xa6, 0x79, 0x61, 0x0d, 0xed, 0x93, 0xb6, 0x4a, 0xc3,
+ 0xa5, 0x34, 0x22, 0x56, 0x5a, 0x2c, 0x62, 0xd4, 0xad, 0xf2, 0xa6, 0x03, 0x0f, 0x95, 0x1e, 0xc7,
+ 0x1b, 0x9a, 0xca, 0x1f, 0x68, 0x7a, 0x4a, 0xda, 0x89, 0x0c, 0x2f, 0xa5, 0xd0, 0xeb, 0x2c, 0x53,
+ 0xd9, 0x12, 0x9d, 0x0d, 0x78, 0x0b, 0x41, 0xee, 0x30, 0xfa, 0x84, 0x34, 0x21, 0xfa, 0xde, 0x1a,
+ 0x24, 0x35, 0xe0, 0x10, 0xa0, 0xb3, 0x4c, 0x5d, 0xcd, 0x57, 0xf4, 0x2b, 0x72, 0xcf, 0x46, 0x85,
+ 0x90, 0xc6, 0x86, 0xe7, 0x89, 0x32, 0x17, 0x32, 0x66, 0x55, 0xd4, 0xe9, 0xd8, 0xa8, 0x18, 0x7d,
+ 0x40, 0x41, 0x51, 0x5e, 0x86, 0x46, 0x5d, 0x4a, 0x11, 0xcb, 0x4b, 0x15, 0x49, 0xc3, 0x6a, 0x4e,
+ 0xd1, 0xc3, 0x87, 0x0e, 0x85, 0xf8, 0x9b, 0x0b, 0x99, 0x24, 0xe2, 0x5d, 0x7e, 0xce, 0xea, 0xa8,
+ 0x12, 0x20, 0xf0, 0xd7, 0xfc, 0x9c, 0x3e, 0x26, 0x04, 0x28, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0x0b,
+ 0x9c, 0x35, 0x80, 0x1c, 0x01, 0x40, 0x9f, 0x90, 0x46, 0x92, 0x2f, 0x45, 0x22, 0x2f, 0x65, 0xc2,
+ 0x1a, 0xe0, 0xea, 0xf7, 0xa5, 0x21, 0x0f, 0x92, 0x7c, 0x79, 0x04, 0x10, 0x7d, 0x44, 0x60, 0xed,
+ 0x58, 0x27, 0x2e, 0xb5, 0x93, 0x7c, 0x89, 0xb4, 0x7f, 0x49, 0xca, 0x85, 0x61, 0xcd, 0x5e, 0x69,
+ 0xaf, 0x39, 0x7c, 0x38, 0xf8, 0xd5, 0xc2, 0xe0, 0xe5, 0xc2, 0xd0, 0x5d, 0xd2, 0xc9, 0x72, 0xab,
+ 0x16, 0xd7, 0xc2, 0x44, 0x5a, 0x15, 0xd6, 0xb0, 0x16, 0x5a, 0xd1, 0x76, 0xe8, 0xdc, 0x81, 0xc0,
+ 0x2a, 0x30, 0xce, 0xda, 0x8e, 0x69, 0x64, 0xff, 0x31, 0x21, 0x45, 0xa8, 0x65, 0x66, 0x85, 0x4a,
+ 0x97, 0xac, 0x83, 0x3b, 0x0d, 0x87, 0x4c, 0xd2, 0x25, 0x38, 0x6e, 0x75, 0x18, 0xad, 0x44, 0x2a,
+ 0x53, 0x76, 0xcf, 0x39, 0x8e, 0xc0, 0xb1, 0x4c, 0xe1, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc,
+ 0x2e, 0x58, 0xd7, 0x39, 0x0e, 0xc8, 0x21, 0x00, 0x40, 0xd3, 0x4f, 0xb9, 0x5e, 0x6d, 0xf8, 0xdf,
+ 0x46, 0x96, 0x1b, 0x00, 0x39, 0xf6, 0x1f, 0x13, 0x92, 0xa8, 0x6c, 0x25, 0xb4, 0x4c, 0xc3, 0x82,
+ 0x51, 0x77, 0x1c, 0x10, 0x0e, 0x00, 0xdd, 0x25, 0x55, 0x28, 0x4e, 0xc3, 0x3e, 0xee, 0x6d, 0xed,
+ 0x35, 0x87, 0xf7, 0x06, 0x77, 0xeb, 0x95, 0xbb, 0x5d, 0xfa, 0x94, 0xd4, 0xa3, 0x62, 0x2d, 0xa2,
+ 0xb0, 0x60, 0xf7, 0x7b, 0xa5, 0xbd, 0xf6, 0xf7, 0xe4, 0xf9, 0xf0, 0xd5, 0xf3, 0x57, 0xdf, 0xbd,
+ 0x1c, 0xbe, 0x7a, 0xc1, 0x6b, 0x51, 0xb1, 0x3e, 0x08, 0x0b, 0xba, 0x43, 0x9a, 0x8b, 0x5c, 0x47,
+ 0x52, 0x28, 0x0d, 0x6f, 0x3d, 0xc0, 0xb7, 0x08, 0x42, 0x13, 0x40, 0x80, 0x04, 0x79, 0x25, 0x23,
+ 0x11, 0xa5, 0x31, 0x7b, 0xd8, 0xdb, 0x02, 0x12, 0x40, 0x3e, 0x48, 0x21, 0x49, 0xea, 0x58, 0xeb,
+ 0x99, 0x65, 0x9f, 0xa0, 0x25, 0x9d, 0xc1, 0x9d, 0xda, 0xe7, 0x35, 0x79, 0x65, 0x8f, 0x33, 0x0b,
+ 0x2c, 0xa4, 0x61, 0x06, 0xfc, 0xb8, 0xf2, 0x32, 0x8c, 0x39, 0x16, 0x1c, 0x7a, 0xe0, 0x40, 0xba,
+ 0x4b, 0xea, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc2, 0xfb, 0x5a, 0x83, 0x5b, 0xe5, 0xc8, 0x6b, 0xd1,
+ 0x92, 0x03, 0x31, 0x3b, 0xa4, 0xa9, 0x8d, 0x15, 0x46, 0x9d, 0x27, 0x50, 0x07, 0x9f, 0x3a, 0x93,
+ 0xb5, 0xb1, 0x73, 0x87, 0xd0, 0xfd, 0xdb, 0x65, 0xcf, 0x3e, 0xc3, 0xab, 0x9a, 0x83, 0x0f, 0x10,
+ 0x6f, 0xf8, 0xf5, 0x38, 0xa6, 0x3d, 0xd2, 0x42, 0xa6, 0x36, 0x8e, 0xfc, 0xc6, 0xdd, 0x06, 0xd8,
+ 0xc8, 0x19, 0xbf, 0xe3, 0x6a, 0xca, 0x5c, 0x84, 0x1a, 0x9e, 0x7b, 0xec, 0x14, 0xe4, 0x95, 0x9d,
+ 0x3b, 0x64, 0xa3, 0x90, 0x86, 0xc6, 0x4a, 0x6d, 0xd8, 0x93, 0x1b, 0x85, 0x63, 0x87, 0x40, 0x08,
+ 0xcd, 0x4a, 0x15, 0x78, 0xff, 0x8e, 0x0b, 0x21, 0xc8, 0x70, 0x39, 0xb4, 0xaf, 0x2c, 0x3c, 0x4f,
+ 0xa4, 0x58, 0x18, 0xd6, 0xc3, 0xbd, 0xc0, 0x01, 0x63, 0x43, 0xf7, 0x48, 0xd3, 0x57, 0xb2, 0x50,
+ 0x59, 0xce, 0x3e, 0x47, 0x47, 0x82, 0x81, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xc9, 0x72, 0xfa,
+ 0x67, 0xf2, 0xf1, 0xdd, 0x00, 0x8b, 0x14, 0x9a, 0x50, 0xbf, 0x57, 0xda, 0xeb, 0x0c, 0xdb, 0x2e,
+ 0x3f, 0xa2, 0x25, 0x82, 0x7c, 0xfb, 0x4e, 0xd0, 0x8f, 0xf3, 0x58, 0xc2, 0x43, 0xcb, 0x8b, 0xdc,
+ 0x58, 0x91, 0xa8, 0x54, 0x59, 0xf6, 0x14, 0xb3, 0xa5, 0xfe, 0xed, 0x37, 0xcf, 0xff, 0xf8, 0xe2,
+ 0xe5, 0x77, 0x9c, 0xe0, 0xde, 0x11, 0x6c, 0xd1, 0x3d, 0xd2, 0xc5, 0x44, 0x11, 0x26, 0x0a, 0x33,
+ 0x01, 0xdd, 0xcf, 0xb0, 0x2f, 0xd0, 0xec, 0x0e, 0xe2, 0xf3, 0x28, 0xcc, 0x66, 0x80, 0xd2, 0x4f,
+ 0x21, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0xef, 0x98, 0x97, 0x31, 0xa7, 0xd2, 0xc2, 0x5e,
+ 0x8b, 0xcc, 0xb0, 0x2f, 0xe1, 0x31, 0x5e, 0x47, 0xf9, 0x04, 0x7c, 0xae, 0xbb, 0x51, 0x60, 0xd8,
+ 0x57, 0x3e, 0xbb, 0xef, 0x8e, 0x06, 0x5e, 0x03, 0xf9, 0xc4, 0xd0, 0xcf, 0x49, 0xcb, 0x67, 0x47,
+ 0xa1, 0xf3, 0xc2, 0xb0, 0xdf, 0x62, 0x85, 0xfa, 0x06, 0x3e, 0x03, 0x88, 0xee, 0x93, 0xed, 0xdb,
+ 0x2a, 0xae, 0x93, 0xec, 0xa3, 0xde, 0xbd, 0x5b, 0x7a, 0xd8, 0x51, 0x9e, 0x93, 0x87, 0x5e, 0x37,
+ 0x5e, 0xa7, 0x85, 0x88, 0xf2, 0xcc, 0xea, 0x3c, 0x49, 0xa4, 0x66, 0x5f, 0xa3, 0xf5, 0xf7, 0xdd,
+ 0xee, 0xe1, 0x3a, 0x2d, 0x0e, 0x6e, 0xf6, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, 0x81, 0x67,
+ 0xcf, 0xf0, 0xf6, 0x96, 0x03, 0x5d, 0x8c, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, 0xbf, 0x73,
+ 0xde, 0x7a, 0x91, 0x7e, 0x4d, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, 0x96, 0x17,
+ 0x96, 0x0d, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x57, 0xaa, 0x98, 0x64, 0x63, 0x84, 0xc1, 0xe1, 0x9f,
+ 0x64, 0xb8, 0x12, 0xe6, 0xda, 0x44, 0x36, 0x31, 0xec, 0xf7, 0xa8, 0xd6, 0x04, 0x6c, 0xee, 0x20,
+ 0x6c, 0x1c, 0xe1, 0xfb, 0x6b, 0xec, 0x85, 0x86, 0x7d, 0xe3, 0x1b, 0x47, 0xf8, 0xfe, 0x7a, 0x06,
+ 0x00, 0x36, 0x6b, 0x1b, 0xda, 0xb5, 0x81, 0xba, 0xf8, 0x16, 0xbb, 0x4e, 0xe0, 0x80, 0x71, 0x0c,
+ 0xc1, 0xca, 0x75, 0x71, 0x01, 0xb4, 0x5a, 0xe3, 0xb3, 0x99, 0x0d, 0x9d, 0x29, 0x6e, 0x63, 0x66,
+ 0x8d, 0x4b, 0xe9, 0xfe, 0x33, 0xff, 0x47, 0xc0, 0x50, 0x69, 0x69, 0x0a, 0xa0, 0x5b, 0x4b, 0x63,
+ 0x73, 0x2d, 0x63, 0x9c, 0x97, 0x01, 0xbf, 0x91, 0xfb, 0xbb, 0x64, 0x1b, 0xb5, 0x3d, 0xe0, 0x0e,
+ 0xf8, 0x09, 0xe7, 0x66, 0x1f, 0x2c, 0xfb, 0x2f, 0x49, 0x13, 0xd5, 0x5c, 0x6b, 0xa6, 0x0f, 0x49,
+ 0xcd, 0xf5, 0x6c, 0x3f, 0x7f, 0xbd, 0xf4, 0xcb, 0xd1, 0xd8, 0xff, 0xc1, 0xfd, 0x95, 0xc4, 0x42,
+ 0x86, 0x76, 0xad, 0x9d, 0x9f, 0xa9, 0x4c, 0x05, 0xb6, 0xe3, 0x8d, 0x35, 0xa9, 0x4c, 0x4f, 0x41,
+ 0xfe, 0x59, 0x8c, 0xca, 0x3f, 0x8b, 0x51, 0xff, 0x9f, 0x25, 0x12, 0x78, 0x6b, 0xff, 0x46, 0xfb,
+ 0xa4, 0x62, 0xaf, 0x0b, 0x37, 0xcd, 0x3b, 0xc3, 0xce, 0x60, 0xb3, 0x21, 0x00, 0xe5, 0xb8, 0x47,
+ 0x9f, 0x90, 0x0a, 0x8c, 0x75, 0xbc, 0xa9, 0x39, 0x24, 0x83, 0x9b, 0x41, 0xcf, 0x11, 0xbf, 0x3d,
+ 0x82, 0xd6, 0x51, 0x04, 0xdf, 0xb4, 0xad, 0x3b, 0x23, 0xc8, 0x81, 0x60, 0xf3, 0x4a, 0xca, 0x42,
+ 0xe4, 0x85, 0xcc, 0xfc, 0xe0, 0x0e, 0x00, 0x98, 0x16, 0x32, 0xa3, 0xfb, 0x24, 0xd8, 0x38, 0x87,
+ 0x03, 0xbb, 0xb9, 0xb1, 0x65, 0x83, 0xf2, 0x9b, 0xfd, 0xfe, 0xbf, 0xcb, 0xfe, 0xb3, 0x81, 0x61,
+ 0xfe, 0x7f, 0x3c, 0x60, 0xa4, 0xbe, 0x31, 0x0d, 0xbe, 0x35, 0x01, 0xdf, 0x88, 0xf4, 0x29, 0xa9,
+ 0x00, 0xc5, 0x68, 0xf1, 0xcd, 0xa0, 0xb9, 0x21, 0x9d, 0xe3, 0x26, 0x7d, 0x46, 0xea, 0x9e, 0x59,
+ 0xb4, 0xbb, 0x39, 0xa4, 0x83, 0x5f, 0xd0, 0xcd, 0x37, 0x2a, 0xf4, 0x0b, 0x52, 0x73, 0x8e, 0x7b,
+ 0x47, 0x5a, 0x83, 0x5b, 0xa4, 0x73, 0xbf, 0xe7, 0xe7, 0x7b, 0xed, 0x7f, 0xce, 0xf7, 0x47, 0x40,
+ 0x96, 0x90, 0x5a, 0x67, 0x39, 0xfe, 0x3e, 0xaa, 0xbc, 0x1e, 0xe9, 0x11, 0x88, 0x77, 0x62, 0x16,
+ 0xfc, 0xf7, 0x98, 0x41, 0xf0, 0xdd, 0x35, 0xa9, 0x59, 0xe2, 0x4f, 0xa4, 0xc1, 0x03, 0xbc, 0x27,
+ 0x35, 0x4b, 0x18, 0x73, 0x97, 0x52, 0x1b, 0x95, 0x67, 0xf8, 0x0b, 0x69, 0x6e, 0x1a, 0xaa, 0x07,
+ 0xf9, 0x66, 0xb7, 0xff, 0xf7, 0x12, 0x69, 0xdd, 0xde, 0x81, 0xdf, 0x60, 0x1a, 0xbe, 0xcb, 0xb5,
+ 0xcf, 0x72, 0x27, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x8f, 0xa7, 0x13, 0x00, 0x5d, 0x2a, 0xeb, 0xbf,
+ 0xe6, 0x0d, 0xee, 0x04, 0x28, 0x2b, 0xb3, 0x3e, 0x77, 0x3f, 0xa4, 0x8a, 0x2f, 0x58, 0x2f, 0xc3,
+ 0x09, 0xfc, 0xe9, 0x62, 0x20, 0xab, 0xdc, 0x09, 0xf0, 0x95, 0x81, 0x5e, 0x89, 0xb1, 0x6b, 0x70,
+ 0x5c, 0xef, 0x0b, 0x6f, 0x97, 0x1f, 0x01, 0x94, 0x90, 0xda, 0xe4, 0xcd, 0xc9, 0x94, 0x8f, 0xba,
+ 0x1f, 0xd1, 0x26, 0xa9, 0x1f, 0xbc, 0x11, 0x27, 0xd3, 0x93, 0x51, 0xb7, 0x44, 0x1b, 0xa4, 0x3a,
+ 0xe3, 0xd3, 0xd9, 0xbc, 0x5b, 0xa6, 0x01, 0xa9, 0xcc, 0xa7, 0xe3, 0xd3, 0xee, 0x16, 0xac, 0xc6,
+ 0x67, 0x47, 0x47, 0xdd, 0x0a, 0x9c, 0x9b, 0x9f, 0xf2, 0xc9, 0xc1, 0x69, 0xb7, 0x0a, 0xe7, 0x0e,
+ 0x47, 0xe3, 0xd7, 0x67, 0x47, 0xa7, 0xdd, 0xda, 0xfe, 0x3f, 0x4a, 0xbe, 0x04, 0x37, 0x99, 0x05,
+ 0x37, 0x8d, 0x8e, 0x67, 0xa7, 0x3f, 0x76, 0x3f, 0x82, 0xf3, 0x87, 0x67, 0xc7, 0xb3, 0x6e, 0x09,
+ 0xce, 0xf0, 0xd1, 0xfc, 0x14, 0x1e, 0x2e, 0x83, 0xc6, 0xc1, 0x5f, 0x46, 0x07, 0x3f, 0x74, 0xb7,
+ 0x68, 0x8b, 0x04, 0x33, 0x3e, 0x12, 0xa8, 0x55, 0xa1, 0xf7, 0x48, 0x73, 0xf6, 0xfa, 0xcd, 0x48,
+ 0xcc, 0x47, 0xfc, 0xed, 0x88, 0x77, 0xab, 0xf0, 0xec, 0xc9, 0xf4, 0x74, 0x32, 0xfe, 0xb1, 0x5b,
+ 0xa3, 0x5d, 0xd2, 0x3a, 0x98, 0x9d, 0x4d, 0x4e, 0xc6, 0x53, 0xa7, 0x5e, 0xa7, 0xdb, 0xa4, 0xbd,
+ 0x41, 0xdc, 0x7d, 0x01, 0x40, 0xe3, 0xd1, 0xeb, 0xd3, 0x33, 0x3e, 0xf2, 0x50, 0x03, 0x9e, 0x7e,
+ 0x3b, 0xe2, 0xf3, 0xc9, 0xf4, 0xa4, 0x4b, 0xfe, 0x13, 0x00, 0x00, 0xff, 0xff, 0x5f, 0x2a, 0xaf,
+ 0x49, 0x5b, 0x0d, 0x00, 0x00,
+}
--- /dev/null
syntax = "proto2";

/*
 * Address/port (or pid/fd) of a CRIU page server used for remote or
 * lazy memory transfer during dump/restore.
 */
message criu_page_server_info {
	optional string address = 1;
	optional int32 port = 2;
	optional int32 pid = 3;
	optional int32 fd = 4;
}

/* Veth pair names inside/outside the namespace (see criu_opts.veths). */
message criu_veth_pair {
	required string if_in = 1;
	required string if_out = 2;
};

/* Mapping of an external mount key to its value (see criu_opts.ext_mnt). */
message ext_mount_map {
	required string key = 1;
	required string val = 2;
};

/* Existing namespace to join, identified by its proc file. */
message join_namespace {
	required string ns = 1;
	required string ns_file = 2;
	optional string extra_opt = 3;
}

/* File descriptor to be inherited under the given key (see criu_opts.inherit_fd). */
message inherit_fd {
	required string key = 1;
	required int32 fd = 2;
};

/* Cgroup root override for controller "ctrl" (all controllers when unset). */
message cgroup_root {
	optional string ctrl = 1;
	required string path = 2;
};

/* External unix socket identified by inode (see criu_opts.unix_sk_ino). */
message unix_sk {
	required uint32 inode = 1;
};

/* Mode used for criu_opts.manage_cgroups_mode. */
enum criu_cg_mode {
	IGNORE = 0;
	CG_NONE = 1;
	PROPS = 2;
	SOFT = 3;
	FULL = 4;
	STRICT = 5;
	DEFAULT = 6;
};
+
/*
 * criu_opts carries all options for a CRIU request. Field numbers are
 * part of the wire format and must never be reused; note that tag 40
 * is skipped below.
 */
message criu_opts {
	required int32 images_dir_fd = 1;
	optional int32 pid = 2; /* if not set on dump, will dump requesting process */

	optional bool leave_running = 3;
	optional bool ext_unix_sk = 4;
	optional bool tcp_established = 5;
	optional bool evasive_devices = 6;
	optional bool shell_job = 7;
	optional bool file_locks = 8;
	optional int32 log_level = 9 [default = 2];
	optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */

	optional criu_page_server_info ps = 11;

	optional bool notify_scripts = 12;

	optional string root = 13;
	optional string parent_img = 14;
	optional bool track_mem = 15;
	optional bool auto_dedup = 16;

	optional int32 work_dir_fd = 17;
	optional bool link_remap = 18;
	repeated criu_veth_pair veths = 19; /* DEPRECATED, use external instead */

	optional uint32 cpu_cap = 20 [default = 0xffffffff];
	optional bool force_irmap = 21;
	repeated string exec_cmd = 22;

	repeated ext_mount_map ext_mnt = 23; /* DEPRECATED, use external instead */
	optional bool manage_cgroups = 24; /* backward compatibility */
	repeated cgroup_root cg_root = 25;

	optional bool rst_sibling = 26; /* swrk only */
	repeated inherit_fd inherit_fd = 27; /* swrk only */

	optional bool auto_ext_mnt = 28;
	optional bool ext_sharing = 29;
	optional bool ext_masters = 30;

	repeated string skip_mnt = 31;
	repeated string enable_fs = 32;

	repeated unix_sk unix_sk_ino = 33; /* DEPRECATED, use external instead */

	optional criu_cg_mode manage_cgroups_mode = 34;
	optional uint32 ghost_limit = 35 [default = 0x100000];
	repeated string irmap_scan_paths = 36;
	repeated string external = 37;
	optional uint32 empty_ns = 38;
	repeated join_namespace join_ns = 39;

	/* tag 40 intentionally unused */
	optional string cgroup_props = 41;
	optional string cgroup_props_file = 42;
	repeated string cgroup_dump_controller = 43;

	optional string freeze_cgroup = 44;
	optional uint32 timeout = 45;
	optional bool tcp_skip_in_flight = 46;
	optional bool weak_sysctls = 47;
	optional bool lazy_pages = 48;
	optional int32 status_fd = 49;
	optional bool orphan_pts_master = 50;
}
+
/* Payload returned for DUMP requests. */
message criu_dump_resp {
	optional bool restored = 1;
}

/* Payload returned for RESTORE requests. */
message criu_restore_resp {
	required int32 pid = 1;
}

/* Sent by the service when an action script fires (see criu_opts.notify_scripts). */
message criu_notify {
	optional string script = 1;
	optional int32 pid = 2;
}

/* Kind of operation a criu_req asks the service to perform. */
enum criu_req_type {
	EMPTY = 0;
	DUMP = 1;
	RESTORE = 2;
	CHECK = 3;
	PRE_DUMP = 4;
	PAGE_SERVER = 5;

	NOTIFY = 6;

	CPUINFO_DUMP = 7;
	CPUINFO_CHECK = 8;

	FEATURE_CHECK = 9;

	VERSION = 10;
}

/*
 * List of features which can queried via
 * CRIU_REQ_TYPE__FEATURE_CHECK
 */
message criu_features {
	optional bool mem_track = 1;
	optional bool lazy_pages = 2;
}

/*
 * Request -- each type corresponds to must-be-there
 * request arguments of respective type
 */

message criu_req {
	required criu_req_type type = 1;

	optional criu_opts opts = 2;
	optional bool notify_success = 3;

	/*
	 * When set service won't close the connection but
	 * will wait for more req-s to appear. Works not
	 * for all request types.
	 */
	optional bool keep_open = 4;
	/*
	 * 'features' can be used to query which features
	 * are supported by the installed criu/kernel
	 * via RPC.
	 */
	optional criu_features features = 5;
}

/*
 * Response -- it states whether the request was served
 * and additional request-specific information
 */

message criu_resp {
	required criu_req_type type = 1;
	required bool success = 2;

	optional criu_dump_resp dump = 3;
	optional criu_restore_resp restore = 4;
	optional criu_notify notify = 5;
	optional criu_page_server_info ps = 6;

	optional int32 cr_errno = 7;
	optional criu_features features = 8;
	optional string cr_errmsg = 9;
	optional criu_version version = 10;
}

/* Answer for criu_req_type.VERSION requests */
message criu_version {
	required int32 major = 1;
	required int32 minor = 2;
	optional string gitid = 3;
	optional int32 sublevel = 4;
	optional int32 extra = 5;
	optional string name = 6;
}
--- /dev/null
+package devices
+
+import (
+ "errors"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+
+ "golang.org/x/sys/unix"
+)
+
// ErrNotADevice is returned when a path does not refer to a block or
// character device node.
var (
	ErrNotADevice = errors.New("not a device node")
)

// Testing dependencies: indirections over the real syscall/stdlib
// functions so tests can inject failures.
var (
	unixLstat     = unix.Lstat
	ioutilReadDir = ioutil.ReadDir
)
+
+// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the information about a linux device and return that information as a Device struct.
+func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+ var stat unix.Stat_t
+ err := unixLstat(path, &stat)
+ if err != nil {
+ return nil, err
+ }
+
+ var (
+ devNumber = uint64(stat.Rdev)
+ major = unix.Major(devNumber)
+ minor = unix.Minor(devNumber)
+ )
+ if major == 0 {
+ return nil, ErrNotADevice
+ }
+
+ var (
+ devType rune
+ mode = stat.Mode
+ )
+ switch {
+ case mode&unix.S_IFBLK == unix.S_IFBLK:
+ devType = 'b'
+ case mode&unix.S_IFCHR == unix.S_IFCHR:
+ devType = 'c'
+ }
+ return &configs.Device{
+ Type: devType,
+ Path: path,
+ Major: int64(major),
+ Minor: int64(minor),
+ Permissions: permissions,
+ FileMode: os.FileMode(mode),
+ Uid: stat.Uid,
+ Gid: stat.Gid,
+ }, nil
+}
+
// HostDevices returns all device nodes found under /dev, recursing into
// subdirectories (see getDevices for the directories that are skipped).
func HostDevices() ([]*configs.Device, error) {
	return getDevices("/dev")
}
+
+func getDevices(path string) ([]*configs.Device, error) {
+ files, err := ioutilReadDir(path)
+ if err != nil {
+ return nil, err
+ }
+ out := []*configs.Device{}
+ for _, f := range files {
+ switch {
+ case f.IsDir():
+ switch f.Name() {
+ // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
+ case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts":
+ continue
+ default:
+ sub, err := getDevices(filepath.Join(path, f.Name()))
+ if err != nil {
+ return nil, err
+ }
+
+ out = append(out, sub...)
+ continue
+ }
+ case f.Name() == "console":
+ continue
+ }
+ device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
+ if err != nil {
+ if err == ErrNotADevice {
+ continue
+ }
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, err
+ }
+ out = append(out, device)
+ }
+ return out, nil
+}
--- /dev/null
+package devices
+
+import (
+ "errors"
+ "os"
+ "testing"
+
+ "golang.org/x/sys/unix"
+)
+
+func TestDeviceFromPathLstatFailure(t *testing.T) {
+ testError := errors.New("test error")
+
+ // Override unix.Lstat to inject error.
+ unixLstat = func(path string, stat *unix.Stat_t) error {
+ return testError
+ }
+
+ _, err := DeviceFromPath("", "")
+ if err != testError {
+ t.Fatalf("Unexpected error %v, expected %v", err, testError)
+ }
+}
+
+func TestHostDevicesIoutilReadDirFailure(t *testing.T) {
+ testError := errors.New("test error")
+
+ // Override ioutil.ReadDir to inject error.
+ ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
+ return nil, testError
+ }
+
+ _, err := HostDevices()
+ if err != testError {
+ t.Fatalf("Unexpected error %v, expected %v", err, testError)
+ }
+}
+
+func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
+ testError := errors.New("test error")
+ called := false
+
+ // Override ioutil.ReadDir to inject error after the first call.
+ ioutilReadDir = func(dirname string) ([]os.FileInfo, error) {
+ if called {
+ return nil, testError
+ }
+ called = true
+
+ // Provoke a second call.
+ fi, err := os.Lstat("/tmp")
+ if err != nil {
+ t.Fatalf("Unexpected error %v", err)
+ }
+
+ return []os.FileInfo{fi}, nil
+ }
+
+ _, err := HostDevices()
+ if err != testError {
+ t.Fatalf("Unexpected error %v, expected %v", err, testError)
+ }
+}
--- /dev/null
+package libcontainer
+
+import "io"
+
// ErrorCode is the API error code type.
type ErrorCode int

// API error codes.
const (
	// Factory errors
	IdInUse ErrorCode = iota
	InvalidIdFormat

	// Container errors
	ContainerNotExists
	ContainerPaused
	ContainerNotStopped
	ContainerNotRunning
	ContainerNotPaused

	// Process errors
	NoProcessOps

	// Common errors
	ConfigInvalid
	ConsoleExists
	SystemError
)

// String returns the human-readable description of the error code,
// or "Unknown error" for values outside the defined set.
func (code ErrorCode) String() string {
	switch code {
	case IdInUse:
		return "Id already in use"
	case InvalidIdFormat:
		return "Invalid format"
	case ContainerNotExists:
		return "Container does not exist"
	case ContainerPaused:
		return "Container paused"
	case ContainerNotStopped:
		return "Container is not stopped"
	case ContainerNotRunning:
		return "Container is not running"
	case ContainerNotPaused:
		return "Container is not paused"
	case NoProcessOps:
		return "No process operations"
	case ConfigInvalid:
		return "Invalid configuration"
	case ConsoleExists:
		return "Console exists for process"
	case SystemError:
		return "System error"
	default:
		return "Unknown error"
	}
}
+
// Error is the API error type returned by libcontainer operations.
type Error interface {
	error

	// Detail writes the detail of the Error to w, returning an error if
	// the write fails. The detail of the Error may include the error
	// message and a representation of the stack trace.
	Detail(w io.Writer) error

	// Code returns the error code for this error.
	Code() ErrorCode
}
--- /dev/null
+package libcontainer
+
+import "testing"
+
+func TestErrorCode(t *testing.T) {
+ codes := map[ErrorCode]string{
+ IdInUse: "Id already in use",
+ InvalidIdFormat: "Invalid format",
+ ContainerPaused: "Container paused",
+ ConfigInvalid: "Invalid configuration",
+ SystemError: "System error",
+ ContainerNotExists: "Container does not exist",
+ ContainerNotStopped: "Container is not stopped",
+ ContainerNotRunning: "Container is not running",
+ ConsoleExists: "Console exists for process",
+ ContainerNotPaused: "Container is not paused",
+ NoProcessOps: "No process operations",
+ }
+
+ for code, expected := range codes {
+ if actual := code.String(); actual != expected {
+ t.Fatalf("expected string %q but received %q", expected, actual)
+ }
+ }
+}
--- /dev/null
+package libcontainer
+
+import (
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// Factory is the entry point for creating and loading containers.
type Factory interface {
	// Create creates a new container with the given id and starts the initial process inside it.
	// id must be a string containing only letters, digits and underscores and must contain
	// between 1 and 1024 characters, inclusive.
	//
	// The id must not already be in use by an existing container. Containers created using
	// a factory with the same path (and filesystem) must have distinct ids.
	//
	// Returns the new container with a running process.
	//
	// errors:
	// IdInUse - id is already in use by a container
	// InvalidIdFormat - id has incorrect format
	// ConfigInvalid - config is invalid
	// SystemError - system error
	//
	// On error, any partially created container parts are cleaned up (the operation is atomic).
	Create(id string, config *configs.Config) (Container, error)

	// Load takes an ID for an existing container and returns the container information
	// from the state. This presents a read only view of the container.
	//
	// errors:
	// Path does not exist
	// System error
	Load(id string) (Container, error)

	// StartInitialization is an internal API to libcontainer used during the reexec of the
	// container.
	//
	// Errors:
	// Pipe connection error
	// System error
	StartInitialization() error

	// Type returns info string about factory type (e.g. lxc, libcontainer...)
	Type() string
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "regexp"
+ "runtime/debug"
+ "strconv"
+
+ "github.com/cyphar/filepath-securejoin"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+ "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/configs/validate"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/mount"
+ "github.com/opencontainers/runc/libcontainer/utils"
+
+ "golang.org/x/sys/unix"
+)
+
const (
	stateFilename    = "state.json"
	execFifoFilename = "exec.fifo"
)

// idRegex matches valid container IDs: word characters (letters, digits,
// underscore) plus '+', '.' and '-'.
//
// NOTE: the previous pattern `^[\w+-\.]+$` put '-' between '+' and '\.'
// inside the character class, creating the range '+'-'.' which silently
// also accepted ','. Listing '-' last makes it a literal.
var idRegex = regexp.MustCompile(`^[\w+.-]+$`)
+
+// InitArgs returns an options func to configure a LinuxFactory with the
+// provided init binary path and arguments.
+func InitArgs(args ...string) func(*LinuxFactory) error {
+ return func(l *LinuxFactory) (err error) {
+ if len(args) > 0 {
+ // Resolve relative paths to ensure that its available
+ // after directory changes.
+ if args[0], err = filepath.Abs(args[0]); err != nil {
+ return newGenericError(err, ConfigInvalid)
+ }
+ }
+
+ l.InitArgs = args
+ return nil
+ }
+}
+
+// SystemdCgroups is an options func to configure a LinuxFactory to return
+// containers that use systemd to create and manage cgroups.
+func SystemdCgroups(l *LinuxFactory) error {
+ l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+ return &systemd.Manager{
+ Cgroups: config,
+ Paths: paths,
+ }
+ }
+ return nil
+}
+
+// Cgroupfs is an options func to configure a LinuxFactory to return containers
+// that use the native cgroups filesystem implementation to create and manage
+// cgroups.
+func Cgroupfs(l *LinuxFactory) error {
+ l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+ return &fs.Manager{
+ Cgroups: config,
+ Paths: paths,
+ }
+ }
+ return nil
+}
+
+// RootlessCgroupfs is an options func to configure a LinuxFactory to return
+// containers that use the native cgroups filesystem implementation to create
+// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is
+// that RootlessCgroupfs can transparently handle permission errors that occur
+// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if
+// they've been set up properly).
+func RootlessCgroupfs(l *LinuxFactory) error {
+ l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+ return &fs.Manager{
+ Cgroups: config,
+ Rootless: true,
+ Paths: paths,
+ }
+ }
+ return nil
+}
+
+// IntelRdtfs is an options func to configure a LinuxFactory to return
+// containers that use the Intel RDT "resource control" filesystem to
+// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
+func IntelRdtFs(l *LinuxFactory) error {
+ l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
+ return &intelrdt.IntelRdtManager{
+ Config: config,
+ Id: id,
+ Path: path,
+ }
+ }
+ return nil
+}
+
+// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
+func TmpfsRoot(l *LinuxFactory) error {
+ mounted, err := mount.Mounted(l.Root)
+ if err != nil {
+ return err
+ }
+ if !mounted {
+ if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// CriuPath returns an option func to configure a LinuxFactory with the
+// provided criupath
+func CriuPath(criupath string) func(*LinuxFactory) error {
+ return func(l *LinuxFactory) error {
+ l.CriuPath = criupath
+ return nil
+ }
+}
+
+// New returns a linux based container factory based in the root directory and
+// configures the factory with the provided option funcs.
+func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
+ if root != "" {
+ if err := os.MkdirAll(root, 0700); err != nil {
+ return nil, newGenericError(err, SystemError)
+ }
+ }
+ l := &LinuxFactory{
+ Root: root,
+ InitPath: "/proc/self/exe",
+ InitArgs: []string{os.Args[0], "init"},
+ Validator: validate.New(),
+ CriuPath: "criu",
+ }
+ Cgroupfs(l)
+ for _, opt := range options {
+ if opt == nil {
+ continue
+ }
+ if err := opt(l); err != nil {
+ return nil, err
+ }
+ }
+ return l, nil
+}
+
// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
	// Root directory for the factory to store state.
	Root string

	// InitPath is the path for calling the init responsibilities for spawning
	// a container.
	InitPath string

	// InitArgs are arguments for calling the init responsibilities for spawning
	// a container.
	InitArgs []string

	// CriuPath is the path to the criu binary used for checkpoint and restore of
	// containers.
	CriuPath string

	// NewuidmapPath and NewgidmapPath are the paths to the binaries used for
	// mapping uids/gids with rootless containers.
	NewuidmapPath string
	NewgidmapPath string

	// Validator provides validation to container configurations.
	Validator validate.Validator

	// NewCgroupsManager returns an initialized cgroups manager for a single container.
	NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager

	// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
	NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
}
+
// Create creates a new container directory under l.Root for the given id,
// validates the config, and returns the container in the stopped state.
//
// Errors: ConfigInvalid when the factory has no root or the config fails
// validation, InvalidIdFormat for a malformed id, IdInUse when the id's
// directory already exists, SystemError otherwise.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	}
	// SecureJoin resolves id relative to l.Root without letting symlinks
	// escape the root.
	containerRoot, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}
	if _, err := os.Stat(containerRoot); err == nil {
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	} else if !os.IsNotExist(err) {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.MkdirAll(containerRoot, 0711); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	// Chown to the effective uid/gid so the directory is owned by the
	// caller even when running in a user namespace.
	if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	c := &linuxContainer{
		id:            id,
		root:          containerRoot,
		config:        config,
		initPath:      l.InitPath,
		initArgs:      l.InitArgs,
		criuPath:      l.CriuPath,
		newuidmapPath: l.NewuidmapPath,
		newgidmapPath: l.NewgidmapPath,
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
	}
	// Intel RDT is only wired up when the kernel reports CAT or MBA support.
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
	}
	c.state = &stoppedState{c: c}
	return c, nil
}
+
// Load reconstructs an existing container from the state stored under
// l.Root/id and returns a read-only view of it.
//
// Errors: ConfigInvalid when the factory has no root, InvalidIdFormat for a
// malformed id, ContainerNotExists when no state file is present,
// SystemError otherwise.
func (l *LinuxFactory) Load(id string) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	// when load, we need to check id is valid or not.
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	containerRoot, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}
	state, err := l.loadState(containerRoot, id)
	if err != nil {
		return nil, err
	}
	// The recorded init process was not spawned by this process; wrap its
	// pid, start time and descriptors in a nonChildProcess.
	r := &nonChildProcess{
		processPid:       state.InitProcessPid,
		processStartTime: state.InitProcessStartTime,
		fds:              state.ExternalDescriptors,
	}
	c := &linuxContainer{
		initProcess:          r,
		initProcessStartTime: state.InitProcessStartTime,
		id:                   id,
		config:               &state.Config,
		initPath:             l.InitPath,
		initArgs:             l.InitArgs,
		criuPath:             l.CriuPath,
		newuidmapPath:        l.NewuidmapPath,
		newgidmapPath:        l.NewgidmapPath,
		cgroupManager:        l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
		root:                 containerRoot,
		created:              state.Created,
	}
	c.state = &loadedState{c: c}
	// refreshState may move the container out of loadedState depending on
	// the live status of the recorded process.
	if err := c.refreshState(); err != nil {
		return nil, err
	}
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
	}
	return c, nil
}
+
// Type returns the factory type string, "libcontainer".
func (l *LinuxFactory) Type() string {
	return "libcontainer"
}
+
// StartInitialization loads a container by opening the pipe fd from the parent
// to read the configuration and state.
// This is a low level implementation detail of the reexec and should not be
// consumed externally. It runs inside the re-exec'd "init" process; on success
// i.Init() ends in an exec and never returns, so the deferred error reporting
// below only fires on failure paths.
func (l *LinuxFactory) StartInitialization() (err error) {
	var (
		pipefd, fifofd int
		consoleSocket  *os.File
		envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
		envFifoFd      = os.Getenv("_LIBCONTAINER_FIFOFD")
		envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
	)

	// Get the INITPIPE: the fd used to sync with and report errors to the parent.
	pipefd, err = strconv.Atoi(envInitPipe)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
	}

	var (
		pipe = os.NewFile(uintptr(pipefd), "pipe")
		it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
	)
	defer pipe.Close()

	// Only init processes have FIFOFD.
	fifofd = -1
	if it == initStandard {
		if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
		}
	}

	// An optional console socket fd may be passed for terminal setup.
	if envConsole != "" {
		console, err := strconv.Atoi(envConsole)
		if err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
		}
		consoleSocket = os.NewFile(uintptr(console), "console-socket")
		defer consoleSocket.Close()
	}

	// clear the current process's environment to clean any libcontainer
	// specific env vars.
	os.Clearenv()

	defer func() {
		// We have an error during the initialization of the container's init,
		// send it back to the parent process in the form of an initError.
		// (Two writes: first the procError sync marker, then the error itself.)
		if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
	}()
	// Convert panics during setup into an err carrying the stack trace; this
	// defer runs before (and feeds) the error-reporting defer above.
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
		}
	}()

	i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
	if err != nil {
		return err
	}

	// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
	return i.Init()
}
+
+func (l *LinuxFactory) loadState(root, id string) (*State, error) {
+ stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
+ if err != nil {
+ return nil, err
+ }
+ f, err := os.Open(stateFilePath)
+ if err != nil {
+ if os.IsNotExist(err) {
+ return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
+ }
+ return nil, newGenericError(err, SystemError)
+ }
+ defer f.Close()
+ var state *State
+ if err := json.NewDecoder(f).Decode(&state); err != nil {
+ return nil, newGenericError(err, SystemError)
+ }
+ return state, nil
+}
+
+func (l *LinuxFactory) validateID(id string) error {
+ if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
+ return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
+ }
+
+ return nil
+}
+
// NewuidmapPath returns an option func to configure a LinuxFactory with the
// provided path to the newuidmap binary.
func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) error {
		l.NewuidmapPath = newuidmapPath
		return nil
	}
}
+
// NewgidmapPath returns an option func to configure a LinuxFactory with the
// provided path to the newgidmap binary.
func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
	return func(l *LinuxFactory) error {
		l.NewgidmapPath = newgidmapPath
		return nil
	}
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "reflect"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/mount"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "golang.org/x/sys/unix"
+)
+
+func newTestRoot() (string, error) {
+ dir, err := ioutil.TempDir("", "libcontainer")
+ if err != nil {
+ return "", err
+ }
+ return dir, nil
+}
+
// TestFactoryNew checks that New returns a *LinuxFactory rooted at the
// requested directory and reporting type "libcontainer".
func TestFactoryNew(t *testing.T) {
	root, rerr := newTestRoot()
	if rerr != nil {
		t.Fatal(rerr)
	}
	defer os.RemoveAll(root)
	factory, err := New(root, Cgroupfs)
	if err != nil {
		t.Fatal(err)
	}
	if factory == nil {
		t.Fatal("factory should not be nil")
	}
	lfactory, ok := factory.(*LinuxFactory)
	if !ok {
		t.Fatal("expected linux factory returned on linux based systems")
	}
	if lfactory.Root != root {
		t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
	}

	if factory.Type() != "libcontainer" {
		t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
	}
}
+
// TestFactoryNewIntelRdt mirrors TestFactoryNew with the additional
// IntelRdtFs option enabled; the observable factory properties must be the
// same.
func TestFactoryNewIntelRdt(t *testing.T) {
	root, rerr := newTestRoot()
	if rerr != nil {
		t.Fatal(rerr)
	}
	defer os.RemoveAll(root)
	factory, err := New(root, Cgroupfs, IntelRdtFs)
	if err != nil {
		t.Fatal(err)
	}
	if factory == nil {
		t.Fatal("factory should not be nil")
	}
	lfactory, ok := factory.(*LinuxFactory)
	if !ok {
		t.Fatal("expected linux factory returned on linux based systems")
	}
	if lfactory.Root != root {
		t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
	}

	if factory.Type() != "libcontainer" {
		t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
	}
}
+
+func TestFactoryNewTmpfs(t *testing.T) {
+ root, rerr := newTestRoot()
+ if rerr != nil {
+ t.Fatal(rerr)
+ }
+ defer os.RemoveAll(root)
+ factory, err := New(root, Cgroupfs, TmpfsRoot)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if factory == nil {
+ t.Fatal("factory should not be nil")
+ }
+ lfactory, ok := factory.(*LinuxFactory)
+ if !ok {
+ t.Fatal("expected linux factory returned on linux based systems")
+ }
+ if lfactory.Root != root {
+ t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
+ }
+
+ if factory.Type() != "libcontainer" {
+ t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
+ }
+ mounted, err := mount.Mounted(lfactory.Root)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !mounted {
+ t.Fatalf("Factory Root is not mounted")
+ }
+ mounts, err := mount.GetMounts()
+ if err != nil {
+ t.Fatal(err)
+ }
+ var found bool
+ for _, m := range mounts {
+ if m.Mountpoint == lfactory.Root {
+ if m.Fstype != "tmpfs" {
+ t.Fatalf("Fstype of root: %s, expected %s", m.Fstype, "tmpfs")
+ }
+ if m.Source != "tmpfs" {
+ t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs")
+ }
+ found = true
+ }
+ }
+ if !found {
+ t.Fatalf("Factory Root is not listed in mounts list")
+ }
+ defer unix.Unmount(root, unix.MNT_DETACH)
+}
+
+func TestFactoryLoadNotExists(t *testing.T) {
+ root, rerr := newTestRoot()
+ if rerr != nil {
+ t.Fatal(rerr)
+ }
+ defer os.RemoveAll(root)
+ factory, err := New(root, Cgroupfs)
+ if err != nil {
+ t.Fatal(err)
+ }
+ _, err = factory.Load("nocontainer")
+ if err == nil {
+ t.Fatal("expected nil error loading non-existing container")
+ }
+ lerr, ok := err.(Error)
+ if !ok {
+ t.Fatal("expected libcontainer error type")
+ }
+ if lerr.Code() != ContainerNotExists {
+ t.Fatalf("expected error code %s but received %s", ContainerNotExists, lerr.Code())
+ }
+}
+
// TestFactoryLoadContainer verifies that a container state written to disk
// round-trips through Factory.Load: the ID, config, init pid, and all
// serializable hooks are restored, while a hook that cannot be serialized
// is silently dropped on load.
func TestFactoryLoadContainer(t *testing.T) {
	root, err := newTestRoot()
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(root)
	// setup default container config and state for mocking
	var (
		id            = "1"
		expectedHooks = &configs.Hooks{
			Prestart: []configs.Hook{
				configs.CommandHook{Command: configs.Command{Path: "prestart-hook"}},
			},
			Poststart: []configs.Hook{
				configs.CommandHook{Command: configs.Command{Path: "poststart-hook"}},
			},
			Poststop: []configs.Hook{
				unserializableHook{},
				configs.CommandHook{Command: configs.Command{Path: "poststop-hook"}},
			},
		}
		expectedConfig = &configs.Config{
			Rootfs: "/mycontainer/root",
			Hooks:  expectedHooks,
		}
		expectedState = &State{
			BaseState: BaseState{
				InitProcessPid: 1024,
				Config:         *expectedConfig,
			},
		}
	)
	// Write the mock state file exactly where loadState will look for it.
	if err := os.Mkdir(filepath.Join(root, id), 0700); err != nil {
		t.Fatal(err)
	}
	if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil {
		t.Fatal(err)
	}
	factory, err := New(root, Cgroupfs, IntelRdtFs)
	if err != nil {
		t.Fatal(err)
	}
	container, err := factory.Load(id)
	if err != nil {
		t.Fatal(err)
	}
	if container.ID() != id {
		t.Fatalf("expected container id %q but received %q", id, container.ID())
	}
	config := container.Config()
	if config.Rootfs != expectedConfig.Rootfs {
		t.Fatalf("expected rootfs %q but received %q", expectedConfig.Rootfs, config.Rootfs)
	}
	expectedHooks.Poststop = expectedHooks.Poststop[1:] // expect unserializable hook to be skipped
	if !reflect.DeepEqual(config.Hooks, expectedHooks) {
		t.Fatalf("expects hooks %q but received %q", expectedHooks, config.Hooks)
	}
	lcontainer, ok := container.(*linuxContainer)
	if !ok {
		t.Fatal("expected linux container on linux based systems")
	}
	if lcontainer.initProcess.pid() != expectedState.InitProcessPid {
		t.Fatalf("expected init pid %d but received %d", expectedState.InitProcessPid, lcontainer.initProcess.pid())
	}
}
+
+func marshal(path string, v interface{}) error {
+ f, err := os.Create(path)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ return utils.WriteJSON(f, v)
+}
+
// unserializableHook is a hook implementation that cannot survive a
// round-trip through the JSON state file; TestFactoryLoadContainer uses it
// to verify such hooks are skipped when a container is loaded.
type unserializableHook struct{}

// Run implements the hook interface as a no-op.
func (unserializableHook) Run(*specs.State) error {
	return nil
}
--- /dev/null
+package libcontainer
+
+import (
+ "fmt"
+ "io"
+ "text/template"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/stacktrace"
+)
+
// errorTemplate renders a genericError for Detail(): the timestamp, error
// code, the message when present, and one stanza per captured stack frame.
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
Message: {{.Message}}
{{end}}
Frames:{{range $i, $frame := .Stack.Frames}}
---
{{$i}}: {{$frame.Function}}
Package: {{$frame.Package}}
File: {{$frame.File}}@{{$frame.Line}}{{end}}
`))
+
+func newGenericError(err error, c ErrorCode) Error {
+ if le, ok := err.(Error); ok {
+ return le
+ }
+ gerr := &genericError{
+ Timestamp: time.Now(),
+ Err: err,
+ ECode: c,
+ Stack: stacktrace.Capture(1),
+ }
+ if err != nil {
+ gerr.Message = err.Error()
+ }
+ return gerr
+}
+
// newSystemError returns a SystemError wrapping err with no cause annotation.
func newSystemError(err error) Error {
	return createSystemError(err, "")
}

// newSystemErrorWithCausef is newSystemErrorWithCause with a
// fmt.Sprintf-formatted cause string.
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
	return createSystemError(err, fmt.Sprintf(cause, v...))
}

// newSystemErrorWithCause returns a SystemError wrapping err, annotated with
// a human-readable description of what was being attempted.
func newSystemErrorWithCause(err error, cause string) Error {
	return createSystemError(err, cause)
}
+
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error: Capture(2) skips createSystemError itself plus its
// newSystemError* wrapper, so the recorded stack starts at the real caller.
func createSystemError(err error, cause string) Error {
	gerr := &genericError{
		Timestamp: time.Now(),
		Err:       err,
		ECode:     SystemError,
		Cause:     cause,
		Stack:     stacktrace.Capture(2),
	}
	if err != nil {
		gerr.Message = err.Error()
	}
	return gerr
}
+
// genericError is the concrete implementation of the libcontainer Error
// interface, pairing a message and error code with the stack captured when
// the error was created.
type genericError struct {
	Timestamp time.Time
	ECode     ErrorCode
	// Err is the original underlying error; excluded from JSON encoding.
	Err     error `json:"-"`
	Cause   string
	Message string
	Stack   stacktrace.Stacktrace
}
+
// Error returns just the message when no cause was recorded; otherwise the
// message is prefixed with the innermost captured frame and the cause.
// NOTE(review): this indexes Stack.Frames[0] unconditionally when Cause is
// set — it assumes stacktrace.Capture always yields at least one frame;
// confirm that holds for hand-constructed genericError values.
func (e *genericError) Error() string {
	if e.Cause == "" {
		return e.Message
	}
	frame := e.Stack.Frames[0]
	return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
+
// Code returns the ErrorCode recorded for this error.
func (e *genericError) Code() ErrorCode {
	return e.ECode
}

// Detail writes the full error report (timestamp, code, message, and stack
// frames) to w using errorTemplate.
func (e *genericError) Detail(w io.Writer) error {
	return errorTemplate.Execute(w, e)
}
--- /dev/null
+package libcontainer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "testing"
+)
+
+func TestErrorDetail(t *testing.T) {
+ err := newGenericError(fmt.Errorf("test error"), SystemError)
+ if derr := err.Detail(ioutil.Discard); derr != nil {
+ t.Fatal(derr)
+ }
+}
+
// TestErrorWithCode checks that the code passed to newGenericError is
// preserved and reported by Code().
func TestErrorWithCode(t *testing.T) {
	err := newGenericError(fmt.Errorf("test error"), SystemError)
	if code := err.Code(); code != SystemError {
		t.Fatalf("expected err code %q but %q", SystemError, code)
	}
}
+
// TestErrorWithError checks Error() formatting of system errors: without a
// cause the message is returned verbatim; with a cause the output must
// differ from the bare message (it gains the file:line/cause prefix).
func TestErrorWithError(t *testing.T) {
	cc := []struct {
		errmsg string
		cause  string
	}{
		{
			errmsg: "test error",
		},
		{
			errmsg: "test error",
			cause:  "test",
		},
	}

	for _, v := range cc {
		err := newSystemErrorWithCause(fmt.Errorf(v.errmsg), v.cause)

		msg := err.Error()
		if v.cause == "" && msg != v.errmsg {
			t.Fatalf("expected err(%q) equal errmsg(%q)", msg, v.errmsg)
		}
		if v.cause != "" && msg == v.errmsg {
			t.Fatalf("unexpected err(%q) equal errmsg(%q)", msg, v.errmsg)
		}

	}
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net"
+ "os"
+ "strings"
+ "syscall" // only for Errno
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+
+ "github.com/containerd/console"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/user"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
+ "github.com/vishvananda/netlink"
+)
+
// initType selects which initializer newContainerInit constructs.
type initType string

const (
	initSetns    initType = "setns"    // join an existing container's namespaces
	initStandard initType = "standard" // full init of a fresh container
)

// pid is the JSON payload carrying the init pid and the intermediate
// first-child pid exchanged with the parent process.
type pid struct {
	Pid           int `json:"pid"`
	PidFirstChild int `json:"pid_first"`
}

// network is an internal struct used to setup container networks.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}

// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerId      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
}

// initer is implemented by the setns and standard initializers.
type initer interface {
	Init() error
}
+
// newContainerInit decodes the initConfig JSON sent by the parent over pipe,
// loads the requested environment into this process, and returns the
// initializer implementation selected by t.
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
	var config *initConfig
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	}
	// Make the container's environment visible before any further setup runs.
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
	}
	switch t {
	case initSetns:
		return &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			config:        config,
		}, nil
	case initStandard:
		return &linuxStandardInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFd:        fifoFd,
		}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}
+
+// populateProcessEnvironment loads the provided environment variables into the
+// current processes's environment.
+func populateProcessEnvironment(env []string) error {
+ for _, pair := range env {
+ p := strings.SplitN(pair, "=", 2)
+ if len(p) < 2 {
+ return fmt.Errorf("invalid environment '%v'", pair)
+ }
+ if err := os.Setenv(p[0], p[1]); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
	// Ensure that all unwanted fds we may have accidentally
	// inherited are marked close-on-exec so they stay out of the
	// container. Fds 0-2 are stdio and the next PassedFilesCount fds were
	// deliberately passed in, so everything above those is foreign.
	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
		return errors.Wrap(err, "close exec fds")
	}

	// Per-process capabilities take precedence over the container config's.
	capabilities := &configs.Capabilities{}
	if config.Capabilities != nil {
		capabilities = config.Capabilities
	} else if config.Config.Capabilities != nil {
		capabilities = config.Config.Capabilities
	}
	w, err := newContainerCapList(capabilities)
	if err != nil {
		return err
	}
	// drop capabilities in bounding set before changing user
	if err := w.ApplyBoundingSet(); err != nil {
		return errors.Wrap(err, "apply bounding set")
	}
	// preserve existing capabilities while we change users
	if err := system.SetKeepCaps(); err != nil {
		return errors.Wrap(err, "set keep caps")
	}
	if err := setupUser(config); err != nil {
		return errors.Wrap(err, "setup user")
	}
	if err := system.ClearKeepCaps(); err != nil {
		return errors.Wrap(err, "clear keep caps")
	}
	if err := w.ApplyCaps(); err != nil {
		return errors.Wrap(err, "apply caps")
	}
	if config.Cwd != "" {
		if err := unix.Chdir(config.Cwd); err != nil {
			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
		}
	}
	return nil
}
+
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
	defer socket.Close()
	// At this point, /dev/ptmx points to something that we would expect. We
	// used to change the owner of the slave path, but since the /dev/pts mount
	// can have gid=X set (at the users' option). So touching the owner of the
	// slave PTY is not necessary, as the kernel will handle that for us. Note
	// however, that setupUser (specifically fixStdioPermissions) *will* change
	// the UID owner of the console to be the user the process will run as (so
	// they can actually control their console).

	pty, slavePath, err := console.NewPty()
	if err != nil {
		return err
	}

	// Apply the requested terminal dimensions, when both were provided.
	if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
		err = pty.Resize(console.WinSize{
			Height: config.ConsoleHeight,
			Width:  config.ConsoleWidth,
		})

		if err != nil {
			return err
		}
	}

	// After we return from here, we don't need the console anymore. The
	// parent keeps its own handle via the fd sent over the socket below.
	defer pty.Close()

	// Mount the console inside our rootfs.
	if mount {
		if err := mountConsole(slavePath); err != nil {
			return err
		}
	}
	// While we can access console.master, using the API is a good idea.
	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
		return err
	}
	// Now, dup over all the things.
	return dupStdio(slavePath)
}
+
+// syncParentReady sends to the given pipe a JSON payload which indicates that
+// the init is ready to Exec the child process. It then waits for the parent to
+// indicate that it is cleared to Exec.
+func syncParentReady(pipe io.ReadWriter) error {
+ // Tell parent.
+ if err := writeSync(pipe, procReady); err != nil {
+ return err
+ }
+
+ // Wait for parent to give the all-clear.
+ return readSync(pipe, procRun)
+}
+
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume. This is the procHooks/procResume
// handshake, mirroring syncParentReady.
func syncParentHooks(pipe io.ReadWriter) error {
	// Tell parent.
	if err := writeSync(pipe, procHooks); err != nil {
		return err
	}

	// Wait for parent to give the all-clear.
	return readSync(pipe, procResume)
}
+
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
	// Set up defaults.
	defaultExecUser := user.ExecUser{
		Uid:  0,
		Gid:  0,
		Home: "/",
	}

	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return err
	}

	groupPath, err := user.GetGroupPath()
	if err != nil {
		return err
	}

	// Resolve config.User against the container's passwd/group databases.
	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
	if err != nil {
		return err
	}

	var addGroups []int
	if len(config.AdditionalGroups) > 0 {
		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
		if err != nil {
			return err
		}
	}

	// Rather than just erroring out later in setuid(2) and setgid(2), check
	// that the user is mapped here.
	if _, err := config.Config.HostUID(execUser.Uid); err != nil {
		return fmt.Errorf("cannot set uid to unmapped user in user namespace")
	}
	if _, err := config.Config.HostGID(execUser.Gid); err != nil {
		return fmt.Errorf("cannot set gid to unmapped user in user namespace")
	}

	if config.RootlessEUID {
		// We cannot set any additional groups in a rootless container and thus
		// we bail if the user asked us to do so. TODO: We currently can't do
		// this check earlier, but if libcontainer.Process.User was typesafe
		// this might work.
		if len(addGroups) > 0 {
			return fmt.Errorf("cannot set any additional groups in a rootless container")
		}
	}

	// Before we change to the container's user make sure that the processes
	// STDIO is correctly owned by the user that we are switching to.
	if err := fixStdioPermissions(config, execUser); err != nil {
		return err
	}

	setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
	if err != nil && !os.IsNotExist(err) {
		return err
	}

	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
	// There's nothing we can do about /etc/group entries, so we silently
	// ignore setting groups here (since the user didn't explicitly ask us to
	// set the group).
	allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"

	if allowSupGroups {
		suppGroups := append(execUser.Sgids, addGroups...)
		if err := unix.Setgroups(suppGroups); err != nil {
			return err
		}
	}

	// Change gid before uid: once the uid drops we may no longer have the
	// privilege to change group ids.
	if err := system.Setgid(execUser.Gid); err != nil {
		return err
	}
	if err := system.Setuid(execUser.Uid); err != nil {
		return err
	}

	// if we didn't get HOME already, set it based on the user's HOME
	if envHome := os.Getenv("HOME"); envHome == "" {
		if err := os.Setenv("HOME", execUser.Home); err != nil {
			return err
		}
	}
	return nil
}
+
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
// NOTE(review): the config parameter is never referenced in this body;
// presumably kept for signature symmetry with the other setup helpers —
// confirm before removing.
func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
	var null unix.Stat_t
	if err := unix.Stat("/dev/null", &null); err != nil {
		return err
	}
	for _, fd := range []uintptr{
		os.Stdin.Fd(),
		os.Stderr.Fd(),
		os.Stdout.Fd(),
	} {
		var s unix.Stat_t
		if err := unix.Fstat(int(fd), &s); err != nil {
			return err
		}

		// Skip chown of /dev/null if it was used as one of the STDIO fds.
		if s.Rdev == null.Rdev {
			continue
		}

		// We only change the uid owner (as it is possible for the mount to
		// prefer a different gid, and there's no reason for us to change it).
		// The reason why we don't just leave the default uid=X mount setup is
		// that users expect to be able to actually use their console. Without
		// this code, you couldn't effectively run as a non-root user inside a
		// container and also have a console set up.
		if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
			// If we've hit an EINVAL then s.Gid isn't mapped in the user
			// namespace. If we've hit an EPERM then the inode's current owner
			// is not mapped in our user namespace (in particular,
			// privileged_wrt_inode_uidgid() has failed). In either case, we
			// are in a configuration where it's better for us to just not
			// touch the stdio rather than bail at this point.
			if err == unix.EINVAL || err == unix.EPERM {
				continue
			}
			return err
		}
	}
	return nil
}
+
+// setupNetwork sets up and initializes any network interface inside the container.
+func setupNetwork(config *initConfig) error {
+ for _, config := range config.Networks {
+ strategy, err := getStrategy(config.Type)
+ if err != nil {
+ return err
+ }
+ if err := strategy.initialize(config); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func setupRoute(config *configs.Config) error {
+ for _, config := range config.Routes {
+ _, dst, err := net.ParseCIDR(config.Destination)
+ if err != nil {
+ return err
+ }
+ src := net.ParseIP(config.Source)
+ if src == nil {
+ return fmt.Errorf("Invalid source for route: %s", config.Source)
+ }
+ gw := net.ParseIP(config.Gateway)
+ if gw == nil {
+ return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
+ }
+ l, err := netlink.LinkByName(config.InterfaceName)
+ if err != nil {
+ return err
+ }
+ route := &netlink.Route{
+ Scope: netlink.SCOPE_UNIVERSE,
+ Dst: dst,
+ Src: src,
+ Gw: gw,
+ LinkIndex: l.Attrs().Index,
+ }
+ if err := netlink.RouteAdd(route); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func setupRlimits(limits []configs.Rlimit, pid int) error {
+ for _, rlimit := range limits {
+ if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
+ return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
+ }
+ }
+ return nil
+}
+
// _P_PID is the idtype argument to waitid(2) selecting a single pid.
const _P_PID = 1

// siginfo mirrors the start of the kernel's siginfo_t as filled by waitid(2).
type siginfo struct {
	si_signo int32
	si_errno int32
	si_code  int32
	// below here is a union; si_pid is the only field we use
	si_pid int32
	// Pad to 128 bytes as detailed in blockUntilWaitable
	pad [96]byte
}
+
// isWaitable returns true if the process has exited, false otherwise.
// It is based on blockUntilWaitable in src/os/wait_waitid.go. WNOWAIT leaves
// the child in a waitable state so a subsequent wait can still collect its
// status; WNOHANG makes the probe non-blocking.
func isWaitable(pid int) (bool, error) {
	si := &siginfo{}
	_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
	if e != 0 {
		return false, os.NewSyscallError("waitid", e)
	}

	// si_pid stays 0 when no child changed state (WNOHANG path).
	return si.si_pid != 0, nil
}
+
+// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
+func isNoChildren(err error) bool {
+ switch err := err.(type) {
+ case syscall.Errno:
+ if err == unix.ECHILD {
+ return true
+ }
+ case *os.SyscallError:
+ if err.Err == unix.ECHILD {
+ return true
+ }
+ }
+ return false
+}
+
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
	var procs []*os.Process
	// Best-effort freeze: a failure is only logged and signaling proceeds.
	if err := m.Freeze(configs.Frozen); err != nil {
		logrus.Warn(err)
	}
	pids, err := m.GetAllPids()
	if err != nil {
		m.Freeze(configs.Thawed)
		return err
	}
	for _, pid := range pids {
		p, err := os.FindProcess(pid)
		if err != nil {
			logrus.Warn(err)
			continue
		}
		procs = append(procs, p)
		if err := p.Signal(s); err != nil {
			logrus.Warn(err)
		}
	}
	// Thaw so the signaled processes can run and act on the signal.
	if err := m.Freeze(configs.Thawed); err != nil {
		logrus.Warn(err)
	}

	subreaper, err := system.GetSubreaper()
	if err != nil {
		// The error here means that PR_GET_CHILD_SUBREAPER is not
		// supported because this code might run on a kernel older
		// than 3.4. We don't want to throw an error in that case,
		// and we simplify things, considering there is no subreaper
		// set.
		subreaper = 0
	}

	for _, p := range procs {
		if s != unix.SIGKILL {
			if ok, err := isWaitable(p.Pid); err != nil {
				if !isNoChildren(err) {
					logrus.Warn("signalAllProcesses: ", p.Pid, err)
				}
				continue
			} else if !ok {
				// Not ready to report so don't wait
				continue
			}
		}

		// In case a subreaper has been setup, this code must not
		// wait for the process. Otherwise, we cannot be sure the
		// current process will be reaped by the subreaper, while
		// the subreaper might be waiting for this process in order
		// to retrieve its exit code.
		if subreaper == 0 {
			if _, err := p.Wait(); err != nil {
				if !isNoChildren(err) {
					logrus.Warn("wait: ", err)
				}
			}
		}
	}
	return nil
}
--- /dev/null
+package integration
+
+import (
+ "bufio"
+ "bytes"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/configs"
+
+ "golang.org/x/sys/unix"
+)
+
+func showFile(t *testing.T, fname string) error {
+ t.Logf("=== %s ===\n", fname)
+
+ f, err := os.Open(fname)
+ if err != nil {
+ t.Log(err)
+ return err
+ }
+ defer f.Close()
+
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ t.Log(scanner.Text())
+ }
+
+ if err := scanner.Err(); err != nil {
+ return err
+ }
+
+ t.Logf("=== END ===\n")
+
+ return nil
+}
+
// TestUsernsCheckpoint runs the checkpoint/restore cycle inside a new user
// namespace; it skips when the kernel does not expose userns or criu cannot
// c/r userns containers.
func TestUsernsCheckpoint(t *testing.T) {
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("userns is unsupported")
	}
	cmd := exec.Command("criu", "check", "--feature", "userns")
	if err := cmd.Run(); err != nil {
		t.Skip("Unable to c/r a container with userns")
	}
	testCheckpoint(t, true)
}
+
// TestCheckpoint runs the checkpoint/restore cycle without a user namespace.
func TestCheckpoint(t *testing.T) {
	testCheckpoint(t, false)
}
+
// testCheckpoint exercises the full criu flow: start a container running
// `cat`, take a pre-dump (container keeps running), take a final dump
// (container stops), then restore it and confirm the restored process still
// reads stdin and echoes to stdout.
func testCheckpoint(t *testing.T, userns bool) {
	if testing.Short() {
		return
	}
	root, err := newTestRoot()
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(root)

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	config := newTemplateConfig(rootfs)

	config.Mounts = append(config.Mounts, &configs.Mount{
		Destination: "/sys/fs/cgroup",
		Device:      "cgroup",
		Flags:       defaultMountFlags | unix.MS_RDONLY,
	})

	if userns {
		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
	}

	factory, err := libcontainer.New(root, libcontainer.Cgroupfs)

	if err != nil {
		t.Fatal(err)
	}

	container, err := factory.Create("test", config)
	if err != nil {
		t.Fatal(err)
	}
	defer container.Destroy()

	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	var stdout bytes.Buffer

	// `cat` blocks on stdin, keeping the container alive across the dumps.
	pconfig := libcontainer.Process{
		Cwd:    "/",
		Args:   []string{"cat"},
		Env:    standardEnvironment,
		Stdin:  stdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Run(&pconfig)
	stdinR.Close()
	defer stdinW.Close()
	if err != nil {
		t.Fatal(err)
	}

	pid, err := pconfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err := os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	parentDir, err := ioutil.TempDir("", "criu-parent")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(parentDir)

	// Phase 1: pre-dump into parentDir; the container must remain Running.
	preDumpOpts := &libcontainer.CriuOpts{
		ImagesDirectory: parentDir,
		WorkDirectory:   parentDir,
		PreDump:         true,
	}
	preDumpLog := filepath.Join(preDumpOpts.WorkDirectory, "dump.log")

	if err := container.Checkpoint(preDumpOpts); err != nil {
		showFile(t, preDumpLog)
		t.Fatal(err)
	}

	state, err := container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Running {
		t.Fatal("Unexpected preDump state: ", state)
	}

	imagesDir, err := ioutil.TempDir("", "criu")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(imagesDir)

	// Phase 2: final dump incremental to the pre-dump; this stops the
	// container.
	// NOTE(review): ParentImage is presumably resolved relative to the
	// images directory, and "../criu-parent" does not obviously match the
	// randomly suffixed parentDir created above — confirm this path is what
	// criu actually uses.
	checkpointOpts := &libcontainer.CriuOpts{
		ImagesDirectory: imagesDir,
		WorkDirectory:   imagesDir,
		ParentImage:     "../criu-parent",
	}
	dumpLog := filepath.Join(checkpointOpts.WorkDirectory, "dump.log")
	restoreLog := filepath.Join(checkpointOpts.WorkDirectory, "restore.log")

	if err := container.Checkpoint(checkpointOpts); err != nil {
		showFile(t, dumpLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}

	if state != libcontainer.Stopped {
		t.Fatal("Unexpected state checkpoint: ", state)
	}

	stdinW.Close()
	_, err = process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	// reload the container
	container, err = factory.Load("test")
	if err != nil {
		t.Fatal(err)
	}

	restoreStdinR, restoreStdinW, err := os.Pipe()
	if err != nil {
		t.Fatal(err)
	}

	// Phase 3: restore from the final dump; stdout is reused so the buffer
	// accumulates output from before and after the restore.
	restoreProcessConfig := &libcontainer.Process{
		Cwd:    "/",
		Stdin:  restoreStdinR,
		Stdout: &stdout,
		Init:   true,
	}

	err = container.Restore(restoreProcessConfig, checkpointOpts)
	restoreStdinR.Close()
	defer restoreStdinW.Close()
	if err != nil {
		showFile(t, restoreLog)
		t.Fatal(err)
	}

	state, err = container.Status()
	if err != nil {
		t.Fatal(err)
	}
	if state != libcontainer.Running {
		t.Fatal("Unexpected restore state: ", state)
	}

	pid, err = restoreProcessConfig.Pid()
	if err != nil {
		t.Fatal(err)
	}

	process, err = os.FindProcess(pid)
	if err != nil {
		t.Fatal(err)
	}

	// The restored `cat` must still consume stdin and echo it to stdout.
	_, err = restoreStdinW.WriteString("Hello!")
	if err != nil {
		t.Fatal(err)
	}

	restoreStdinW.Close()
	s, err := process.Wait()
	if err != nil {
		t.Fatal(err)
	}

	if !s.Success() {
		t.Fatal(s.String(), pid)
	}

	output := string(stdout.Bytes())
	if !strings.Contains(output, "Hello!") {
		t.Fatal("Did not restore the pipe correctly:", output)
	}
}
--- /dev/null
// Package integration is used for integration testing of libcontainer.
package integration
--- /dev/null
+package integration
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "reflect"
+ "strconv"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "golang.org/x/sys/unix"
+)
+
// TestExecPS checks process visibility inside a container without a user
// namespace.
func TestExecPS(t *testing.T) {
	testExecPS(t, false)
}
+
// TestUsernsExecPS is TestExecPS inside a new user namespace; skipped when
// the kernel does not expose /proc/self/ns/user.
func TestUsernsExecPS(t *testing.T) {
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("userns is unsupported")
	}
	testExecPS(t, true)
}
+
+// testExecPS runs "ps" in a fresh container and asserts that PID 1 is the
+// ps process itself (i.e. the container has a private PID namespace).
+// With userns set, a 1000-wide root-owned ID mapping plus NEWUSER is added.
+func testExecPS(t *testing.T, userns bool) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	if userns {
+		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	}
+
+	buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm")
+	if err != nil {
+		t.Fatalf("%s: %s", buffers, err)
+	}
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+	lines := strings.Split(buffers.Stdout.String(), "\n")
+	// Expect at least a header line plus one process line.
+	// (The old message claimed "more than one process running", which is
+	// the opposite of what this branch detects.)
+	if len(lines) < 2 {
+		t.Fatalf("expected at least one process in output %q", buffers.Stdout.String())
+	}
+	expected := `1 root ps`
+	actual := strings.Trim(lines[1], "\n ")
+	if actual != expected {
+		t.Fatalf("expected output %q but received %q", expected, actual)
+	}
+}
+
+// TestIPCPrivate verifies that a container created with the template
+// config gets an IPC namespace distinct from the host's.
+func TestIPCPrivate(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	hostIPC, err := os.Readlink("/proc/1/ns/ipc")
+	ok(t, err)
+
+	cfg := newTemplateConfig(rootfs)
+	buffers, exitCode, err := runContainer(cfg, "", "readlink", "/proc/self/ns/ipc")
+	ok(t, err)
+
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	containerIPC := strings.Trim(buffers.Stdout.String(), "\n")
+	if containerIPC == hostIPC {
+		t.Fatalf("ipc link should be private to the container but equals host %q %q", containerIPC, hostIPC)
+	}
+}
+
+// TestIPCHost removes NEWIPC from the config and verifies that the
+// container then shares the host's IPC namespace.
+func TestIPCHost(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	hostIPC, err := os.Readlink("/proc/1/ns/ipc")
+	ok(t, err)
+
+	cfg := newTemplateConfig(rootfs)
+	cfg.Namespaces.Remove(configs.NEWIPC)
+	buffers, exitCode, err := runContainer(cfg, "", "readlink", "/proc/self/ns/ipc")
+	ok(t, err)
+
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	containerIPC := strings.Trim(buffers.Stdout.String(), "\n")
+	if containerIPC != hostIPC {
+		t.Fatalf("ipc link not equal to host link %q %q", containerIPC, hostIPC)
+	}
+}
+
+// TestIPCJoinPath joins an existing IPC namespace by path (the host's,
+// via /proc/1/ns/ipc) and verifies the container ends up inside it.
+func TestIPCJoinPath(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	hostIPC, err := os.Readlink("/proc/1/ns/ipc")
+	ok(t, err)
+
+	cfg := newTemplateConfig(rootfs)
+	cfg.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc")
+
+	buffers, exitCode, err := runContainer(cfg, "", "readlink", "/proc/self/ns/ipc")
+	ok(t, err)
+
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	containerIPC := strings.Trim(buffers.Stdout.String(), "\n")
+	if containerIPC != hostIPC {
+		t.Fatalf("ipc link not equal to host link %q %q", containerIPC, hostIPC)
+	}
+}
+
+// TestIPCBadPath points the IPC namespace at a nonexistent path and
+// expects container creation/startup to fail.
+func TestIPCBadPath(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	cfg := newTemplateConfig(rootfs)
+	// Deliberately bogus path ("ipcc").
+	cfg.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipcc")
+
+	if _, _, err := runContainer(cfg, "", "true"); err == nil {
+		t.Fatal("container succeeded with bad ipc path")
+	}
+}
+
+// TestRlimit runs the rlimit check without a user namespace.
+func TestRlimit(t *testing.T) {
+	testRlimit(t, false)
+}
+
+// TestUsernsRlimit runs the rlimit check inside a user namespace,
+// skipping when the kernel lacks userns support.
+func TestUsernsRlimit(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+
+	testRlimit(t, true)
+}
+
+// testRlimit verifies that the container's NOFILE rlimit is applied from
+// the config even when the caller's own limit is lower. The test lowers
+// its own limit to 1024 first, then expects "ulimit -n" inside the
+// container to report 1025 (the value presumably set by the template
+// config — TODO confirm against newTemplateConfig).
+func testRlimit(t *testing.T, userns bool) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	if userns {
+		// Map a 1000-wide root-owned range and add a user namespace.
+		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	}
+
+	// ensure limit is lower than what the config requests to test that in a user namespace
+	// the Setrlimit call happens early enough that we still have permissions to raise the limit.
+	ok(t, unix.Setrlimit(unix.RLIMIT_NOFILE, &unix.Rlimit{
+		Max: 1024,
+		Cur: 1024,
+	}))
+
+	out, _, err := runContainer(config, "", "/bin/sh", "-c", "ulimit -n")
+	ok(t, err)
+	if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" {
+		t.Fatalf("expected rlimit to be 1025, got %s", limit)
+	}
+}
+
+// TestEnter starts an init process and then a second process in the same
+// container, checks that both show up in container.Processes(), and
+// finally verifies both report the same /proc/self/ns/pid link, i.e. the
+// second process entered the first one's PID namespace.
+func TestEnter(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container. "cat" blocks on stdin so
+	// the process stays alive until we close the write end of the pipe.
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+
+	var stdout, stdout2 bytes.Buffer
+
+	pconfig := libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"},
+		Env:    standardEnvironment,
+		Stdin:  stdinR,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	err = container.Run(&pconfig)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+	pid, err := pconfig.Pid()
+	ok(t, err)
+
+	// Execute another process in the container.
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+	pconfig2 := libcontainer.Process{
+		Cwd: "/",
+		Env: standardEnvironment,
+	}
+	pconfig2.Args = []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"}
+	pconfig2.Stdin = stdinR2
+	pconfig2.Stdout = &stdout2
+
+	err = container.Run(&pconfig2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+
+	pid2, err := pconfig2.Pid()
+	ok(t, err)
+
+	processes, err := container.Processes()
+	ok(t, err)
+
+	n := 0
+	for i := range processes {
+		if processes[i] == pid || processes[i] == pid2 {
+			n++
+		}
+	}
+	if n != 2 {
+		t.Fatal("unexpected number of processes", processes, pid, pid2)
+	}
+
+	// Wait processes.
+	stdinW2.Close()
+	waitProcess(&pconfig2, t)
+
+	stdinW.Close()
+	waitProcess(&pconfig, t)
+
+	// Check that both processes live in the same pidns. (The old code
+	// re-checked a stale err here; reading a bytes.Buffer cannot fail.)
+	pidns := stdout.String()
+	pidns2 := stdout2.String()
+	if pidns != pidns2 {
+		t.Fatal("The second process isn't in the required pid namespace", pidns, pidns2)
+	}
+}
+
+// TestProcessEnv checks that variables passed in Process.Env reach the
+// container, and that HOME is populated when not supplied.
+func TestProcessEnv(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	proc := libcontainer.Process{
+		Cwd:  "/",
+		Args: []string{"sh", "-c", "env"},
+		Env: []string{
+			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"HOSTNAME=integration",
+			"TERM=xterm",
+			"FOO=BAR",
+		},
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	ok(t, container.Run(&proc))
+
+	// Wait for process.
+	waitProcess(&proc, t)
+
+	envOut := stdout.String()
+
+	// Check that the environment has the key/value pair we added.
+	if !strings.Contains(envOut, "FOO=BAR") {
+		t.Fatal("Environment doesn't have the expected FOO=BAR key/value pair: ", envOut)
+	}
+
+	// Make sure that HOME is set.
+	if !strings.Contains(envOut, "HOME=/root") {
+		t.Fatal("Environment doesn't have HOME set: ", envOut)
+	}
+}
+
+// TestProcessEmptyCaps runs a process with config.Capabilities set to nil
+// and checks that /proc/self/status still reports a CapEff line.
+func TestProcessEmptyCaps(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	config.Capabilities = nil
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	proc := libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"sh", "-c", "cat /proc/self/status"},
+		Env:    standardEnvironment,
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	ok(t, container.Run(&proc))
+
+	// Wait for process.
+	waitProcess(&proc, t)
+
+	status := stdout.String()
+
+	// Locate the effective-capabilities line in the status output.
+	var capEffLine string
+	for _, raw := range strings.Split(status, "\n") {
+		if trimmed := strings.TrimSpace(raw); strings.Contains(trimmed, "CapEff:") {
+			capEffLine = trimmed
+			break
+		}
+	}
+
+	if capEffLine == "" {
+		t.Fatal("Couldn't find effective caps: ", status)
+	}
+}
+
+// TestProcessCaps grants CAP_NET_ADMIN to a process and verifies bit 12
+// (CAP_NET_ADMIN, per capability.h) is set in the CapEff field of
+// /proc/self/status inside the container.
+func TestProcessCaps(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	pconfig := libcontainer.Process{
+		Cwd:          "/",
+		Args:         []string{"sh", "-c", "cat /proc/self/status"},
+		Env:          standardEnvironment,
+		Stdin:        nil,
+		Stdout:       &stdout,
+		Capabilities: &configs.Capabilities{},
+		Init:         true,
+	}
+	// NOTE(review): these append to the *template config's* capability
+	// sets, so the process gets the template defaults plus CAP_NET_ADMIN —
+	// presumably intentional; confirm it isn't meant to append to
+	// pconfig's own (empty) sets.
+	pconfig.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_NET_ADMIN")
+	pconfig.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_NET_ADMIN")
+	pconfig.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_NET_ADMIN")
+	pconfig.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_NET_ADMIN")
+	err = container.Run(&pconfig)
+	ok(t, err)
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	outputStatus := string(stdout.Bytes())
+
+	lines := strings.Split(outputStatus, "\n")
+
+	// Find the "CapEff:" line of /proc/self/status.
+	effectiveCapsLine := ""
+	for _, l := range lines {
+		line := strings.TrimSpace(l)
+		if strings.Contains(line, "CapEff:") {
+			effectiveCapsLine = line
+			break
+		}
+	}
+
+	if effectiveCapsLine == "" {
+		t.Fatal("Couldn't find effective caps: ", outputStatus)
+	}
+
+	// CapEff is a hex bitmask, e.g. "CapEff: 00000000a80425fb".
+	parts := strings.Split(effectiveCapsLine, ":")
+	effectiveCapsStr := strings.TrimSpace(parts[1])
+
+	effectiveCaps, err := strconv.ParseUint(effectiveCapsStr, 16, 64)
+	if err != nil {
+		t.Fatal("Could not parse effective caps", err)
+	}
+
+	var netAdminMask uint64
+	var netAdminBit uint
+	netAdminBit = 12 // from capability.h
+	netAdminMask = 1 << netAdminBit
+	if effectiveCaps&netAdminMask != netAdminMask {
+		t.Fatal("CAP_NET_ADMIN is not set as expected")
+	}
+}
+
+// TestAdditionalGroups runs "id -Gn" in the container and checks that the
+// supplementary groups requested via AdditionalGroups are present.
+func TestAdditionalGroups(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	factory, err := libcontainer.New(root, libcontainer.Cgroupfs)
+	ok(t, err)
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	pconfig := libcontainer.Process{
+		Cwd: "/",
+		// Fix: "-Gn" must be part of the -c command string; as a separate
+		// argv element it was bound to $0 by the shell and never reached id.
+		Args:             []string{"sh", "-c", "id -Gn"},
+		Env:              standardEnvironment,
+		Stdin:            nil,
+		Stdout:           &stdout,
+		AdditionalGroups: []string{"plugdev", "audio"},
+		Init:             true,
+	}
+	err = container.Run(&pconfig)
+	ok(t, err)
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	outputGroups := stdout.String()
+
+	// Check that the groups output has the groups that we specified
+	if !strings.Contains(outputGroups, "audio") {
+		t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
+	}
+
+	if !strings.Contains(outputGroups, "plugdev") {
+		t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
+	}
+}
+
+// TestFreeze exercises Pause/Resume with the default cgroupfs factory.
+func TestFreeze(t *testing.T) {
+	testFreeze(t, false)
+}
+
+// TestSystemdFreeze exercises Pause/Resume via the systemd cgroup driver,
+// skipping when systemd is not available.
+func TestSystemdFreeze(t *testing.T) {
+	if !systemd.UseSystemd() {
+		t.Skip("Systemd is unsupported")
+	}
+	testFreeze(t, true)
+}
+
+// testFreeze starts a long-lived process (cat blocked on stdin), pauses
+// the container, records its status, resumes it, and only then asserts
+// that the recorded status was Paused. Resume is deliberately called
+// before the assertion so a failing check never leaves a frozen container
+// behind. systemd selects the systemd-backed factory.
+func testFreeze(t *testing.T, systemd bool) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	f := factory
+	if systemd {
+		f = systemdFactory
+	}
+
+	container, err := f.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// cat blocks on the pipe's read end until stdinW is closed below.
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+
+	pconfig := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(pconfig)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	// Pause, sample the status, resume, then check the sample.
+	err = container.Pause()
+	ok(t, err)
+	state, err := container.Status()
+	ok(t, err)
+	err = container.Resume()
+	ok(t, err)
+	if state != libcontainer.Paused {
+		t.Fatal("Unexpected state: ", state)
+	}
+
+	// Unblock cat and wait for a clean exit.
+	stdinW.Close()
+	waitProcess(pconfig, t)
+}
+
+// TestCpuShares checks CPU-shares validation with the cgroupfs driver.
+func TestCpuShares(t *testing.T) {
+	testCpuShares(t, false)
+}
+
+// TestCpuSharesSystemd checks CPU-shares validation with the systemd
+// driver, skipping when systemd is not available.
+func TestCpuSharesSystemd(t *testing.T) {
+	if !systemd.UseSystemd() {
+		t.Skip("Systemd is unsupported")
+	}
+	testCpuShares(t, true)
+}
+
+// testCpuShares sets CpuShares to 1 — below the kernel minimum of 2 — and
+// expects container startup to fail.
+func testCpuShares(t *testing.T, systemd bool) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	if systemd {
+		config.Cgroups.Parent = "system.slice"
+	}
+	config.Cgroups.Resources.CpuShares = 1
+
+	_, _, err = runContainer(config, "", "ps")
+	if err == nil {
+		// Fixed message grammar ("should failed" -> "should have failed").
+		t.Fatalf("runContainer should have failed with invalid CpuShares")
+	}
+}
+
+// TestPids checks the pids cgroup controller with the cgroupfs driver.
+func TestPids(t *testing.T) {
+	testPids(t, false)
+}
+
+// TestPidsSystemd checks the pids cgroup controller with the systemd
+// driver, skipping when systemd is not available.
+func TestPidsSystemd(t *testing.T) {
+	if !systemd.UseSystemd() {
+		t.Skip("Systemd is unsupported")
+	}
+	testPids(t, true)
+}
+
+// testPids checks enforcement of the pids cgroup controller: unlimited
+// (-1) and permissive (64) limits must allow forking, while a pipeline of
+// 65 processes under a limit of 64 must fail with "can't fork".
+func testPids(t *testing.T, systemd bool) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	if systemd {
+		config.Cgroups.Parent = "system.slice"
+	}
+	// -1 means no pids limit.
+	config.Cgroups.Resources.PidsLimit = -1
+
+	// Running multiple processes.
+	_, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
+	if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+		t.Skip("PIDs cgroup is unsupported")
+	}
+	ok(t, err)
+
+	if ret != 0 {
+		t.Fatalf("expected fork() to succeed with no pids limit")
+	}
+
+	// Enforce a permissive limit. This needs to be fairly hand-wavey due to the
+	// issues with running Go binaries with pids restrictions (see below).
+	// (Fixed a recurring "bin/true" typo — every command is now /bin/true.)
+	config.Cgroups.Resources.PidsLimit = 64
+	_, ret, err = runContainer(config, "", "/bin/sh", "-c", `
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true`)
+	if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+		t.Skip("PIDs cgroup is unsupported")
+	}
+	ok(t, err)
+
+	if ret != 0 {
+		t.Fatalf("expected fork() to succeed with permissive pids limit")
+	}
+
+	// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
+	// to fail reliability.
+	config.Cgroups.Resources.PidsLimit = 64
+	out, _, err := runContainer(config, "", "/bin/sh", "-c", `
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true |
+	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true`)
+	if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") {
+		t.Skip("PIDs cgroup is unsupported")
+	}
+	if err != nil && !strings.Contains(out.String(), "sh: can't fork") {
+		ok(t, err)
+	}
+
+	if err == nil {
+		t.Fatalf("expected fork() to fail with restrictive pids limit")
+	}
+
+	// Minimal restrictions are not really supported, due to quirks in using Go
+	// due to the fact that it spawns random processes. While we do our best with
+	// late setting cgroup values, it's just too unreliable with very small pids.max.
+	// As such, we don't test that case. YMMV.
+}
+
+// TestRunWithKernelMemory checks the kernel-memory limit with cgroupfs.
+func TestRunWithKernelMemory(t *testing.T) {
+	testRunWithKernelMemory(t, false)
+}
+
+// TestRunWithKernelMemorySystemd checks the kernel-memory limit with the
+// systemd driver, skipping when systemd is not available.
+func TestRunWithKernelMemorySystemd(t *testing.T) {
+	if !systemd.UseSystemd() {
+		t.Skip("Systemd is unsupported")
+	}
+	testRunWithKernelMemory(t, true)
+}
+
+// testRunWithKernelMemory starts a container with a 50 MiB kernel-memory
+// limit and expects it to run successfully.
+func testRunWithKernelMemory(t *testing.T, systemd bool) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	cfg := newTemplateConfig(rootfs)
+	if systemd {
+		cfg.Cgroups.Parent = "system.slice"
+	}
+	cfg.Cgroups.Resources.KernelMemory = 52428800 // 50 MiB
+
+	if _, _, err = runContainer(cfg, "", "ps"); err != nil {
+		t.Fatalf("runContainer failed with kernel memory limit: %v", err)
+	}
+}
+
+// TestContainerState builds a container that shares the host IPC
+// namespace (NEWIPC is commented out of the namespace list), then reads
+// container.State() and verifies that the recorded NEWIPC namespace path
+// resolves to the same link as the host's /proc/1/ns/ipc.
+func TestContainerState(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	// Host IPC namespace identity, for comparison below.
+	l, err := os.Readlink("/proc/1/ns/ipc")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	config := newTemplateConfig(rootfs)
+	config.Namespaces = configs.Namespaces([]configs.Namespace{
+		{Type: configs.NEWNS},
+		{Type: configs.NEWUTS},
+		// host for IPC
+		//{Type: configs.NEWIPC},
+		{Type: configs.NEWPID},
+		{Type: configs.NEWNET},
+	})
+
+	container, err := factory.Create("test", config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	// Keep the container alive with cat blocked on a pipe.
+	stdinR, stdinW, err := os.Pipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	p := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(p)
+	if err != nil {
+		t.Fatal(err)
+	}
+	stdinR.Close()
+	defer stdinW.Close()
+
+	st, err := container.State()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// The state's recorded IPC namespace path must point at the host's.
+	l1, err := os.Readlink(st.NamespacePaths[configs.NEWIPC])
+	if err != nil {
+		t.Fatal(err)
+	}
+	if l1 != l {
+		t.Fatal("Container using non-host ipc namespace")
+	}
+	stdinW.Close()
+	waitProcess(p, t)
+}
+
+// TestPassExtraFiles passes two pipe write-ends as ExtraFiles (fds 3 and
+// 4 in the container), has the container list its fds and write one byte
+// to each pipe, then verifies the bytes arrive on the read ends.
+func TestPassExtraFiles(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	container, err := factory.Create("test", config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	pipeout1, pipein1, err := os.Pipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Close all pipe ends at test exit; the old code leaked all four fds.
+	defer pipeout1.Close()
+	defer pipein1.Close()
+	pipeout2, pipein2, err := os.Pipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer pipeout2.Close()
+	defer pipein2.Close()
+	process := libcontainer.Process{
+		Cwd:        "/",
+		Args:       []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"},
+		Env:        []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
+		ExtraFiles: []*os.File{pipein1, pipein2},
+		Stdin:      nil,
+		Stdout:     &stdout,
+		Init:       true,
+	}
+	err = container.Run(&process)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	waitProcess(&process, t)
+
+	out := stdout.String()
+	// fd 5 is the directory handle for /proc/$$/fd
+	if out != "0 1 2 3 4 5" {
+		t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to init, got '%s'", out)
+	}
+	var buf = []byte{0}
+	_, err = pipeout1.Read(buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	out1 := string(buf)
+	if out1 != "1" {
+		t.Fatalf("expected first pipe to receive '1', got '%s'", out1)
+	}
+
+	_, err = pipeout2.Read(buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	out2 := string(buf)
+	if out2 != "2" {
+		t.Fatalf("expected second pipe to receive '2', got '%s'", out2)
+	}
+}
+
+// TestMountCmds bind-mounts a temp dir with Premount commands (touch two
+// files) and Postmount commands (copy them to *-backup), then verifies
+// all four files exist in the source directory.
+func TestMountCmds(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	tmpDir, err := ioutil.TempDir("", "tmpdir")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	config := newTemplateConfig(rootfs)
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Source:      tmpDir,
+		Destination: "/tmp",
+		Device:      "bind",
+		Flags:       unix.MS_BIND | unix.MS_REC,
+		PremountCmds: []configs.Command{
+			{Path: "touch", Args: []string{filepath.Join(tmpDir, "hello")}},
+			{Path: "touch", Args: []string{filepath.Join(tmpDir, "world")}},
+		},
+		PostmountCmds: []configs.Command{
+			{Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "hello"), filepath.Join(rootfs, "tmp", "hello-backup")}},
+			{Path: "cp", Args: []string{filepath.Join(rootfs, "tmp", "world"), filepath.Join(rootfs, "tmp", "world-backup")}},
+		},
+	})
+
+	container, err := factory.Create("test", config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	pconfig := libcontainer.Process{
+		Cwd:  "/",
+		Args: []string{"sh", "-c", "env"},
+		Env:  standardEnvironment,
+		Init: true,
+	}
+	err = container.Run(&pconfig)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	entries, err := ioutil.ReadDir(tmpDir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expected := []string{"hello", "hello-backup", "world", "world-backup"}
+	// Guard against an index-out-of-range panic if extra entries appear.
+	if len(entries) != len(expected) {
+		t.Fatalf("expected %d entries, got %d", len(expected), len(entries))
+	}
+	for i, e := range entries {
+		if e.Name() != expected[i] {
+			t.Errorf("Got(%s), expect %s", e.Name(), expected[i])
+		}
+	}
+}
+
+// TestSysctl sets kernel.shmmni via config.Sysctl and verifies the value
+// is visible at /proc/sys/kernel/shmmni inside the container.
+func TestSysctl(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	cfg := newTemplateConfig(rootfs)
+	cfg.Sysctl = map[string]string{
+		"kernel.shmmni": "8192",
+	}
+
+	container, err := factory.Create("test", cfg)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	proc := libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"sh", "-c", "cat /proc/sys/kernel/shmmni"},
+		Env:    standardEnvironment,
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	ok(t, container.Run(&proc))
+
+	// Wait for process.
+	waitProcess(&proc, t)
+
+	if got := strings.TrimSpace(stdout.String()); got != "8192" {
+		t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", got)
+	}
+}
+
+// TestMountCgroupRO mounts the cgroup filesystem read-only and checks the
+// "mount" output: the tmpfs at /sys/fs/cgroup and every cgroup mount must
+// carry ro,nosuid,nodev,noexec, and the tmpfs must use mode=755.
+func TestMountCgroupRO(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Destination: "/sys/fs/cgroup",
+		Device:      "cgroup",
+		Flags:       defaultMountFlags | unix.MS_RDONLY,
+	})
+
+	buffers, exitCode, err := runContainer(config, "", "mount")
+	if err != nil {
+		t.Fatalf("%s: %s", buffers, err)
+	}
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+	mountInfo := buffers.Stdout.String()
+	lines := strings.Split(mountInfo, "\n")
+	for _, l := range lines {
+		// NOTE(review): Contains(l, "ro") also matches substrings like
+		// "errors=remount-ro", so this check is weaker than intended.
+		if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") {
+			if !strings.Contains(l, "ro") ||
+				!strings.Contains(l, "nosuid") ||
+				!strings.Contains(l, "nodev") ||
+				!strings.Contains(l, "noexec") {
+				t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l)
+			}
+			if !strings.Contains(l, "mode=755") {
+				t.Fatalf("Mode expected to contain 'mode=755': %s", l)
+			}
+			continue
+		}
+		// Only individual cgroup controller mounts are checked below.
+		if !strings.HasPrefix(l, "cgroup") {
+			continue
+		}
+		if !strings.Contains(l, "ro") ||
+			!strings.Contains(l, "nosuid") ||
+			!strings.Contains(l, "nodev") ||
+			!strings.Contains(l, "noexec") {
+			t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l)
+		}
+	}
+}
+
+// TestMountCgroupRW mounts the cgroup filesystem read-write and checks
+// the "mount" output: the tmpfs at /sys/fs/cgroup and every cgroup mount
+// must carry rw,nosuid,nodev,noexec, and the tmpfs must use mode=755.
+func TestMountCgroupRW(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Destination: "/sys/fs/cgroup",
+		Device:      "cgroup",
+		Flags:       defaultMountFlags,
+	})
+
+	buffers, exitCode, err := runContainer(config, "", "mount")
+	if err != nil {
+		t.Fatalf("%s: %s", buffers, err)
+	}
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+	mountInfo := buffers.Stdout.String()
+	lines := strings.Split(mountInfo, "\n")
+	for _, l := range lines {
+		// NOTE(review): substring checks ("rw") are loose; they match any
+		// occurrence of the token anywhere in the mount line.
+		if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") {
+			if !strings.Contains(l, "rw") ||
+				!strings.Contains(l, "nosuid") ||
+				!strings.Contains(l, "nodev") ||
+				!strings.Contains(l, "noexec") {
+				t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l)
+			}
+			if !strings.Contains(l, "mode=755") {
+				t.Fatalf("Mode expected to contain 'mode=755': %s", l)
+			}
+			continue
+		}
+		// Only individual cgroup controller mounts are checked below.
+		if !strings.HasPrefix(l, "cgroup") {
+			continue
+		}
+		if !strings.Contains(l, "rw") ||
+			!strings.Contains(l, "nosuid") ||
+			!strings.Contains(l, "nodev") ||
+			!strings.Contains(l, "noexec") {
+			t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l)
+		}
+	}
+}
+
+// TestOomScoreAdj sets OomScoreAdj in the config and verifies that the
+// container process sees that value in /proc/self/oom_score_adj.
+func TestOomScoreAdj(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	cfg := newTemplateConfig(rootfs)
+	cfg.OomScoreAdj = ptrInt(200)
+
+	factory, err := libcontainer.New(root, libcontainer.Cgroupfs)
+	ok(t, err)
+
+	container, err := factory.Create("test", cfg)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	proc := libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"sh", "-c", "cat /proc/self/oom_score_adj"},
+		Env:    standardEnvironment,
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	ok(t, container.Run(&proc))
+
+	// Wait for process.
+	waitProcess(&proc, t)
+
+	// Check that the oom_score_adj matches the value that was set as part of config.
+	got := strings.TrimSpace(stdout.String())
+	if want := strconv.Itoa(*cfg.OomScoreAdj); got != want {
+		t.Fatalf("Expected oom_score_adj %d; got %q", *cfg.OomScoreAdj, got)
+	}
+}
+
+// TestHook registers prestart/poststart/poststop function hooks and
+// verifies each runs with the expected bundle path: prestart creates
+// rootfs/test (visible to "ls /test" in the container), poststart writes
+// "hello world" into it, and poststop removes it on Destroy.
+func TestHook(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	bundle, err := newTestBundle()
+	ok(t, err)
+	defer remove(bundle)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	expectedBundle := bundle
+	config.Labels = append(config.Labels, fmt.Sprintf("bundle=%s", expectedBundle))
+
+	// getRootfsFromBundle reads the rootfs path back out of the
+	// config.json previously written into the bundle.
+	getRootfsFromBundle := func(bundle string) (string, error) {
+		f, err := os.Open(filepath.Join(bundle, "config.json"))
+		if err != nil {
+			return "", err
+		}
+		defer f.Close() // fix: the handle was previously leaked
+
+		var config configs.Config
+		if err = json.NewDecoder(f).Decode(&config); err != nil {
+			return "", err
+		}
+		return config.Rootfs, nil
+	}
+
+	config.Hooks = &configs.Hooks{
+		Prestart: []configs.Hook{
+			configs.NewFunctionHook(func(s *specs.State) error {
+				if s.Bundle != expectedBundle {
+					t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+				}
+
+				root, err := getRootfsFromBundle(s.Bundle)
+				if err != nil {
+					return err
+				}
+				f, err := os.Create(filepath.Join(root, "test"))
+				if err != nil {
+					return err
+				}
+				return f.Close()
+			}),
+		},
+		Poststart: []configs.Hook{
+			configs.NewFunctionHook(func(s *specs.State) error {
+				if s.Bundle != expectedBundle {
+					t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+				}
+
+				root, err := getRootfsFromBundle(s.Bundle)
+				if err != nil {
+					return err
+				}
+				return ioutil.WriteFile(filepath.Join(root, "test"), []byte("hello world"), 0755)
+			}),
+		},
+		Poststop: []configs.Hook{
+			configs.NewFunctionHook(func(s *specs.State) error {
+				if s.Bundle != expectedBundle {
+					t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle)
+				}
+
+				root, err := getRootfsFromBundle(s.Bundle)
+				if err != nil {
+					return err
+				}
+				return os.RemoveAll(filepath.Join(root, "test"))
+			}),
+		},
+	}
+
+	// write config of json format into config.json under bundle
+	f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644)
+	ok(t, err)
+	defer f.Close() // fix: the handle was previously leaked
+	ok(t, json.NewEncoder(f).Encode(config))
+
+	container, err := factory.Create("test", config)
+	ok(t, err)
+
+	var stdout bytes.Buffer
+	pconfig := libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"sh", "-c", "ls /test"},
+		Env:    standardEnvironment,
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	err = container.Run(&pconfig)
+	ok(t, err)
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	outputLs := stdout.String()
+
+	// Check that the ls output has the expected file touched by the prestart hook
+	if !strings.Contains(outputLs, "/test") {
+		container.Destroy()
+		t.Fatalf("ls output doesn't have the expected file: %s", outputLs)
+	}
+
+	// Check that the file is written by the poststart hook
+	testFilePath := filepath.Join(rootfs, "test")
+	contents, err := ioutil.ReadFile(testFilePath)
+	if err != nil {
+		t.Fatalf("cannot read file '%s': %s", testFilePath, err)
+	}
+	if string(contents) != "hello world" {
+		t.Fatalf("Expected test file to contain 'hello world'; got '%s'", string(contents))
+	}
+
+	// Destroy triggers the poststop hook, which must remove the file.
+	if err := container.Destroy(); err != nil {
+		t.Fatalf("container destroy %s", err)
+	}
+	fi, err := os.Stat(filepath.Join(rootfs, "test"))
+	if err == nil || !os.IsNotExist(err) {
+		t.Fatalf("expected file to not exist, got %s", fi.Name())
+	}
+}
+
+// TestSTDIOPermissions checks that the container process may write to
+// /dev/stderr and that the output arrives on the test's stderr buffer.
+func TestSTDIOPermissions(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	cfg := newTemplateConfig(rootfs)
+	buffers, exitCode, err := runContainer(cfg, "", "sh", "-c", "echo hi > /dev/stderr")
+	ok(t, err)
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	got := strings.Trim(buffers.Stderr.String(), "\n")
+	if got != "hi" {
+		t.Fatalf("stderr should equal be equal %q %q", got, "hi")
+	}
+}
+
+// unmountOp lazily detaches the mount at path (MNT_DETACH), used by the
+// propagation tests for deferred cleanup.
+func unmountOp(path string) error {
+	return unix.Unmount(path, unix.MNT_DETACH)
+}
+
+// Launch container with rootfsPropagation in rslave mode. Also
+// bind mount a volume /mnt1host at /mnt1cont at the time of launch. Now do
+// another mount on host (/mnt1host/mnt2host) and this new mount should
+// propagate to container (/mnt1cont/mnt2host)
+func TestRootfsPropagationSlaveMount(t *testing.T) {
+	var mountPropagated bool
+	var dir1cont string
+	var dir2cont string
+
+	dir1cont = "/root/mnt1cont"
+
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+
+	// rslave: host-side mounts propagate into the container, not back out.
+	config.RootPropagation = unix.MS_SLAVE | unix.MS_REC
+
+	// Bind mount a volume
+	dir1host, err := ioutil.TempDir("", "mnt1host")
+	ok(t, err)
+	defer os.RemoveAll(dir1host)
+
+	// Make this dir a "shared" mount point. This will make sure a
+	// slave relationship can be established in container.
+	err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "")
+	ok(t, err)
+	err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "")
+	ok(t, err)
+	defer unmountOp(dir1host)
+
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Source:      dir1host,
+		Destination: dir1cont,
+		Device:      "bind",
+		Flags:       unix.MS_BIND | unix.MS_REC})
+
+	// TODO: systemd specific processing
+	f := factory
+
+	container, err := f.Create("testSlaveMount", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Keep the container alive with cat blocked on a pipe while we
+	// perform the host-side mount below.
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+
+	pconfig := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+
+	err = container.Run(pconfig)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	// Create mnt1host/mnt2host and bind mount itself on top of it. This
+	// should be visible in container.
+	dir2host, err := ioutil.TempDir(dir1host, "mnt2host")
+	ok(t, err)
+	defer os.RemoveAll(dir2host)
+
+	err = unix.Mount(dir2host, dir2host, "bind", unix.MS_BIND, "")
+	defer unmountOp(dir2host)
+	ok(t, err)
+
+	// Run "cat /proc/self/mountinfo" in container and look at mount points.
+	var stdout2 bytes.Buffer
+
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+
+	pconfig2 := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"cat", "/proc/self/mountinfo"},
+		Env:    standardEnvironment,
+		Stdin:  stdinR2,
+		Stdout: &stdout2,
+	}
+
+	err = container.Run(pconfig2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+
+	// Let both processes finish; the second one produced the mountinfo.
+	stdinW2.Close()
+	waitProcess(pconfig2, t)
+	stdinW.Close()
+	waitProcess(pconfig, t)
+
+	mountPropagated = false
+	dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host))
+
+	// mountinfo field 5 (index 4) is the mount point path.
+	propagationInfo := string(stdout2.Bytes())
+	lines := strings.Split(propagationInfo, "\n")
+	for _, l := range lines {
+		linefields := strings.Split(l, " ")
+		if len(linefields) < 5 {
+			continue
+		}
+
+		if linefields[4] == dir2cont {
+			mountPropagated = true
+			break
+		}
+	}
+
+	if mountPropagated != true {
+		t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont)
+	}
+}
+
+// Launch container with rootfsPropagation set to MS_PRIVATE so no shared
+// propagation flags are applied to the rootfs. Also bind mount a shared
+// volume /mnt1host at /mnt1cont at the time of launch. Now do a mount in
+// the container (/mnt1cont/mnt2cont) and this new mount should propagate
+// to the host (/mnt1host/mnt2cont).
+
+func TestRootfsPropagationSharedMount(t *testing.T) {
+	var dir1cont string
+	var dir2cont string
+
+	dir1cont = "/root/mnt1cont"
+
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	config.RootPropagation = unix.MS_PRIVATE
+
+	// Bind mount a volume
+	dir1host, err := ioutil.TempDir("", "mnt1host")
+	ok(t, err)
+	defer os.RemoveAll(dir1host)
+
+	// Make this dir a "shared" mount point. This will make sure a
+	// shared relationship can be established in container.
+	err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "")
+	ok(t, err)
+	err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "")
+	ok(t, err)
+	defer unmountOp(dir1host)
+
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Source:      dir1host,
+		Destination: dir1cont,
+		Device:      "bind",
+		Flags:       unix.MS_BIND | unix.MS_REC})
+
+	// TODO: systemd specific processing
+	f := factory
+
+	container, err := f.Create("testSharedMount", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+
+	// Long-running init ("cat" on a pipe) keeps the container alive while
+	// we exec the mount below.
+	pconfig := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+
+	err = container.Run(pconfig)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	// Create mnt1host/mnt2cont. This will become visible inside container
+	// at mnt1cont/mnt2cont. Bind mount itself on top of it. This
+	// should be visible on host now.
+	dir2host, err := ioutil.TempDir(dir1host, "mnt2cont")
+	ok(t, err)
+	defer os.RemoveAll(dir2host)
+
+	dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host))
+
+	// Mount something in container and see if it is visible on host.
+	var stdout2 bytes.Buffer
+
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+
+	pconfig2 := &libcontainer.Process{
+		Cwd:          "/",
+		Args:         []string{"mount", "--bind", dir2cont, dir2cont},
+		Env:          standardEnvironment,
+		Stdin:        stdinR2,
+		Stdout:       &stdout2,
+		Capabilities: &configs.Capabilities{},
+	}
+
+	// Provide CAP_SYS_ADMIN so mount(2) is permitted inside the container.
+	pconfig2.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_SYS_ADMIN")
+	pconfig2.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_SYS_ADMIN")
+	pconfig2.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_SYS_ADMIN")
+	pconfig2.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_SYS_ADMIN")
+
+	err = container.Run(pconfig2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+
+	// Wait for process
+	stdinW2.Close()
+	waitProcess(pconfig2, t)
+	stdinW.Close()
+	waitProcess(pconfig, t)
+
+	defer unmountOp(dir2host)
+
+	// Check if mount is visible on host or not.
+	out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput()
+	outtrim := strings.TrimSpace(string(out))
+	if err != nil {
+		t.Logf("findmnt error %q: %q", err, outtrim)
+	}
+
+	// Fixed: error message previously said "finmnt"; also dropped the
+	// redundant string() conversion of outtrim (already a string).
+	if outtrim != dir2host {
+		t.Fatalf("Mount in container on %s did not propagate to host on %s. findmnt output=%s", dir2cont, dir2host, outtrim)
+	}
+}
+
+// TestPIDHost runs a container that shares the host PID namespace and
+// verifies the container's /proc/self/ns/pid link equals the host's.
+func TestPIDHost(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	l, err := os.Readlink("/proc/1/ns/pid")
+	ok(t, err)
+
+	config := newTemplateConfig(rootfs)
+	config.Namespaces.Remove(configs.NEWPID)
+	buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/pid")
+	ok(t, err)
+
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
+		// Fixed: message previously said "ipc link" (copy/paste from the IPC test).
+		t.Fatalf("pid link not equal to host link %q %q", actual, l)
+	}
+}
+
+// TestInitJoinPID starts one container, starts a second container that joins
+// the first one's PID namespace, and verifies that both init processes share
+// a pidns while their other namespaces differ.
+func TestInitJoinPID(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	// Execute a long-running container
+	container1, err := newContainer(newTemplateConfig(rootfs))
+	ok(t, err)
+	defer container1.Destroy()
+
+	stdinR1, stdinW1, err := os.Pipe()
+	ok(t, err)
+	// "cat" on a pipe keeps init alive until stdinW1 is closed.
+	init1 := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR1,
+		Init:  true,
+	}
+	err = container1.Run(init1)
+	// The child holds the read end now; drop our copy so closing stdinW1
+	// later delivers EOF.
+	stdinR1.Close()
+	defer stdinW1.Close()
+	ok(t, err)
+
+	// get the state of the first container
+	state1, err := container1.State()
+	ok(t, err)
+	pidns1 := state1.NamespacePaths[configs.NEWPID]
+
+	// Run a container inside the existing pidns but with different cgroups
+	config2 := newTemplateConfig(rootfs)
+	config2.Namespaces.Add(configs.NEWPID, pidns1)
+	config2.Cgroups.Path = "integration/test2"
+	container2, err := newContainerWithName("testCT2", config2)
+	ok(t, err)
+	defer container2.Destroy()
+
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+	init2 := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR2,
+		Init:  true,
+	}
+	err = container2.Run(init2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+	// get the state of the second container
+	state2, err := container2.State()
+	ok(t, err)
+
+	// Both inits must resolve to the same pid namespace link.
+	ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid))
+	ok(t, err)
+	ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid))
+	ok(t, err)
+	if ns1 != ns2 {
+		t.Errorf("pidns(%s), wanted %s", ns2, ns1)
+	}
+
+	// check that namespaces are not the same
+	if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
+		t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
+			state1.NamespacePaths)
+	}
+	// check that pidns is joined correctly. The initial container process list
+	// should contain the second container's init process
+	buffers := newStdBuffers()
+	ps := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"ps"},
+		Env:    standardEnvironment,
+		Stdout: buffers.Stdout,
+	}
+	err = container1.Run(ps)
+	ok(t, err)
+	waitProcess(ps, t)
+
+	// Stop init processes one by one. Stop the second container should
+	// not stop the first.
+	stdinW2.Close()
+	waitProcess(init2, t)
+	stdinW1.Close()
+	waitProcess(init1, t)
+
+	out := strings.TrimSpace(buffers.Stdout.String())
+	// output of ps inside the initial PID namespace should have
+	// 1 line of header,
+	// 2 lines of init processes,
+	// 1 line of ps process
+	if len(strings.Split(out, "\n")) != 4 {
+		t.Errorf("unexpected running process, output %q", out)
+	}
+}
+
+// TestInitJoinNetworkAndUser starts a container with its own user and network
+// namespaces, then starts a second container that joins both, and verifies
+// the two inits share net and user namespaces while the rest differ.
+func TestInitJoinNetworkAndUser(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	// Execute a long-running container
+	config1 := newTemplateConfig(rootfs)
+	config1.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config1.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	container1, err := newContainer(config1)
+	ok(t, err)
+	defer container1.Destroy()
+
+	stdinR1, stdinW1, err := os.Pipe()
+	ok(t, err)
+	init1 := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR1,
+		Init:  true,
+	}
+	err = container1.Run(init1)
+	stdinR1.Close()
+	defer stdinW1.Close()
+	ok(t, err)
+
+	// get the state of the first container
+	state1, err := container1.State()
+	ok(t, err)
+	netns1 := state1.NamespacePaths[configs.NEWNET]
+	userns1 := state1.NamespacePaths[configs.NEWUSER]
+
+	// Run a second container joining the existing network and user
+	// namespaces, but with different cgroups.
+	rootfs2, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs2)
+
+	config2 := newTemplateConfig(rootfs2)
+	config2.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config2.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config2.Namespaces.Add(configs.NEWNET, netns1)
+	config2.Namespaces.Add(configs.NEWUSER, userns1)
+	config2.Cgroups.Path = "integration/test2"
+	container2, err := newContainerWithName("testCT2", config2)
+	ok(t, err)
+	defer container2.Destroy()
+
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+	init2 := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR2,
+		Init:  true,
+	}
+	err = container2.Run(init2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+
+	// get the state of the second container
+	state2, err := container2.State()
+	ok(t, err)
+
+	// Joined namespaces (net, user) must resolve to the same link.
+	for _, ns := range []string{"net", "user"} {
+		ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns))
+		ok(t, err)
+		ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns))
+		ok(t, err)
+		if ns1 != ns2 {
+			t.Errorf("%s(%s), wanted %s", ns, ns2, ns1)
+		}
+	}
+
+	// check that namespaces are not the same
+	if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
+		t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
+			state1.NamespacePaths)
+	}
+	// Stop init processes one by one. Stop the second container should
+	// not stop the first.
+	stdinW2.Close()
+	waitProcess(init2, t)
+	stdinW1.Close()
+	waitProcess(init1, t)
+}
+
+// TestTmpfsCopyUp mounts a tmpfs over /etc with the copy-up extension and
+// checks that the pre-existing /etc/passwd is still visible in the container.
+func TestTmpfsCopyUp(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	root, err := newTestRoot()
+	ok(t, err)
+	defer os.RemoveAll(root)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+
+	config.Mounts = append(config.Mounts, &configs.Mount{
+		Source:      "tmpfs",
+		Destination: "/etc",
+		Device:      "tmpfs",
+		Extensions:  configs.EXT_COPYUP,
+	})
+
+	// Named "fac" so the package-level "factory" variable is not shadowed.
+	fac, err := libcontainer.New(root, libcontainer.Cgroupfs)
+	ok(t, err)
+
+	container, err := fac.Create("test", config)
+	ok(t, err)
+	defer container.Destroy()
+
+	var stdout bytes.Buffer
+	pconfig := libcontainer.Process{
+		Args:   []string{"ls", "/etc/passwd"},
+		Env:    standardEnvironment,
+		Stdin:  nil,
+		Stdout: &stdout,
+		Init:   true,
+	}
+	err = container.Run(&pconfig)
+	ok(t, err)
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	outputLs := stdout.String()
+
+	// Check that the ls output has /etc/passwd
+	if !strings.Contains(outputLs, "/etc/passwd") {
+		t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
+	}
+}
+
+// TestCGROUPPrivate gives the container its own cgroup namespace and asserts
+// that its cgroup namespace link differs from the host's.
+func TestCGROUPPrivate(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+		t.Skip("cgroupns is unsupported")
+	}
+	if testing.Short() {
+		return
+	}
+
+	hostLink, err := os.Readlink("/proc/1/ns/cgroup")
+	ok(t, err)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	config.Namespaces.Add(configs.NEWCGROUP, "")
+
+	buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
+	ok(t, err)
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	containerLink := strings.Trim(buffers.Stdout.String(), "\n")
+	if containerLink == hostLink {
+		t.Fatalf("cgroup link should be private to the container but equals host %q %q", containerLink, hostLink)
+	}
+}
+
+// TestCGROUPHost runs a container without a cgroup namespace and asserts its
+// cgroup namespace link equals the host's.
+func TestCGROUPHost(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+		t.Skip("cgroupns is unsupported")
+	}
+	if testing.Short() {
+		return
+	}
+
+	hostLink, err := os.Readlink("/proc/1/ns/cgroup")
+	ok(t, err)
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	buffers, exitCode, err := runContainer(newTemplateConfig(rootfs), "", "readlink", "/proc/self/ns/cgroup")
+	ok(t, err)
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	containerLink := strings.Trim(buffers.Stdout.String(), "\n")
+	if containerLink != hostLink {
+		t.Fatalf("cgroup link not equal to host link %q %q", containerLink, hostLink)
+	}
+}
--- /dev/null
+package integration
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "os"
+ "strconv"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/containerd/console"
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/utils"
+
+ "golang.org/x/sys/unix"
+)
+
+// TestExecIn execs "ps" into a running container and verifies both the
+// long-running init ("cat") and the exec'd "ps" appear in its output.
+func TestExecIn(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	// Child owns the read end now; closing stdinW later delivers EOF.
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	buffers := newStdBuffers()
+	ps := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"ps"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+	}
+
+	err = container.Run(ps)
+	ok(t, err)
+	waitProcess(ps, t)
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := buffers.Stdout.String()
+	if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
+		t.Fatalf("unexpected running process, output %q", out)
+	}
+	// No tty is attached, so output must not contain carriage returns.
+	if strings.Contains(out, "\r") {
+		t.Fatalf("unexpected carriage-return in output")
+	}
+}
+
+// TestExecInUsernsRlimit runs the per-process rlimit exec test inside a
+// user namespace (skipped when userns is unsupported).
+func TestExecInUsernsRlimit(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+
+	testExecInRlimit(t, true)
+}
+
+// TestExecInRlimit runs the per-process rlimit exec test without a user namespace.
+func TestExecInRlimit(t *testing.T) {
+	testExecInRlimit(t, false)
+}
+
+// testExecInRlimit execs "ulimit -n" into a running container with a
+// per-process RLIMIT_NOFILE override and verifies the override is applied.
+func testExecInRlimit(t *testing.T, userns bool) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	if userns {
+		config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+		config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	}
+
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	buffers := newStdBuffers()
+	ps := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"/bin/sh", "-c", "ulimit -n"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+		Rlimits: []configs.Rlimit{
+			// increase process rlimit higher than container rlimit to test per-process limit
+			{Type: unix.RLIMIT_NOFILE, Hard: 1026, Soft: 1026},
+		},
+		// NOTE(review): Init is true here even though an init process is
+		// already running in this container; sibling tests exec secondary
+		// processes without Init — confirm this is intended.
+		Init: true,
+	}
+	err = container.Run(ps)
+	ok(t, err)
+	waitProcess(ps, t)
+
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := buffers.Stdout.String()
+	if limit := strings.TrimSpace(out); limit != "1026" {
+		t.Fatalf("expected rlimit to be 1026, got %s", limit)
+	}
+}
+
+// TestExecInAdditionalGroups execs "id -Gn" into a running container with
+// extra supplementary groups and verifies those groups are listed.
+func TestExecInAdditionalGroups(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	var stdout bytes.Buffer
+	pconfig := libcontainer.Process{
+		Cwd: "/",
+		// Fixed: "-Gn" was previously a separate argv element, which
+		// "sh -c" treats as $0 instead of passing it to id.
+		Args:             []string{"sh", "-c", "id -Gn"},
+		Env:              standardEnvironment,
+		Stdin:            nil,
+		Stdout:           &stdout,
+		AdditionalGroups: []string{"plugdev", "audio"},
+	}
+	err = container.Run(&pconfig)
+	ok(t, err)
+
+	// Wait for process
+	waitProcess(&pconfig, t)
+
+	stdinW.Close()
+	waitProcess(process, t)
+
+	outputGroups := stdout.String()
+
+	// Check that the groups output has the groups that we specified
+	if !strings.Contains(outputGroups, "audio") {
+		t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups)
+	}
+
+	if !strings.Contains(outputGroups, "plugdev") {
+		t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups)
+	}
+}
+
+// TestExecInError repeatedly execs a nonexistent binary into a running
+// container and checks the "executable file not found" error is returned to
+// the caller and also delivered on the process's stderr.
+func TestExecInError(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer func() {
+		stdinW.Close()
+		if _, err := process.Wait(); err != nil {
+			t.Log(err)
+		}
+	}()
+	ok(t, err)
+
+	// Repeat many times to catch races/fd leaks in the failure path.
+	for i := 0; i < 42; i++ {
+		var out bytes.Buffer
+		unexistent := &libcontainer.Process{
+			Cwd:    "/",
+			Args:   []string{"unexistent"},
+			Env:    standardEnvironment,
+			Stderr: &out,
+		}
+		err = container.Run(unexistent)
+		if err == nil {
+			t.Fatal("Should be an error")
+		}
+		if !strings.Contains(err.Error(), "executable file not found") {
+			t.Fatalf("Should be error about not found executable, got %s", err)
+		}
+		if !bytes.Contains(out.Bytes(), []byte("executable file not found")) {
+			t.Fatalf("executable file not found error not delivered to stdio:\n%s", out.String())
+		}
+	}
+}
+
+// TestExecInTTY execs "ps" with a console attached via a socket pair and
+// verifies its output arrives over the tty with ONLCR cleared (no "\r").
+func TestExecInTTY(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	var stdout bytes.Buffer
+	ps := &libcontainer.Process{
+		Cwd:  "/",
+		Args: []string{"ps"},
+		Env:  standardEnvironment,
+	}
+	parent, child, err := utils.NewSockPair("console")
+	// Fixed: was wrapped in a redundant "if err != nil { ok(t, err) }";
+	// ok() already no-ops on nil error.
+	ok(t, err)
+	defer parent.Close()
+	defer child.Close()
+	ps.ConsoleSocket = child
+	type cdata struct {
+		c   console.Console
+		err error
+	}
+	dc := make(chan *cdata, 1)
+	go func() {
+		f, err := utils.RecvFd(parent)
+		if err != nil {
+			dc <- &cdata{
+				err: err,
+			}
+			return
+		}
+		c, err := console.ConsoleFromFile(f)
+		if err != nil {
+			dc <- &cdata{
+				err: err,
+			}
+			return
+		}
+		// Disable ONLCR so no carriage returns appear in the output.
+		console.ClearONLCR(c.Fd())
+		dc <- &cdata{
+			c: c,
+		}
+	}()
+	err = container.Run(ps)
+	ok(t, err)
+	data := <-dc
+	if data.err != nil {
+		ok(t, data.err)
+	}
+	// Fixed: locals renamed from "console" and "copy", which shadowed the
+	// console package and the copy builtin.
+	tty := data.c
+	copyDone := make(chan struct{})
+	go func() {
+		io.Copy(&stdout, tty)
+		close(copyDone)
+	}()
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatal("Waiting for copy timed out")
+	case <-copyDone:
+	}
+	waitProcess(ps, t)
+
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := stdout.String()
+	if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") {
+		t.Fatalf("unexpected running process, output %q", out)
+	}
+	if strings.Contains(out, "\r") {
+		t.Fatalf("unexpected carriage-return in output")
+	}
+}
+
+// TestExecInEnvironment execs "env" into a running container with a custom
+// environment (including a deliberately duplicated DEBUG entry) and verifies
+// the later duplicate wins and defaults like HOME are filled in.
+func TestExecInEnvironment(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	buffers := newStdBuffers()
+	process2 := &libcontainer.Process{
+		Cwd:  "/",
+		Args: []string{"env"},
+		// DEBUG appears twice on purpose: the assertions below check that
+		// the later value (false) is the one that takes effect.
+		Env: []string{
+			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"DEBUG=true",
+			"DEBUG=false",
+			"ENV=test",
+		},
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+		// NOTE(review): Init is true for this second process even though an
+		// init is already running — confirm this is intended.
+		Init: true,
+	}
+	err = container.Run(process2)
+	ok(t, err)
+	waitProcess(process2, t)
+
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := buffers.Stdout.String()
+	// check execin's process environment
+	if !strings.Contains(out, "DEBUG=false") ||
+		!strings.Contains(out, "ENV=test") ||
+		!strings.Contains(out, "HOME=/root") ||
+		!strings.Contains(out, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") ||
+		strings.Contains(out, "DEBUG=true") {
+		t.Fatalf("unexpected running process, output %q", out)
+	}
+}
+
+// TestExecinPassExtraFiles execs a shell with two extra pipe fds and checks
+// they show up as fds 3 and 4 inside the process and that writes to them
+// arrive on the host ends of the pipes.
+func TestExecinPassExtraFiles(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	// Consistency: use the package ok() helper like the rest of this file
+	// instead of inline "if err != nil { t.Fatal(err) }" blocks.
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	var stdout bytes.Buffer
+	pipeout1, pipein1, err := os.Pipe()
+	ok(t, err)
+	pipeout2, pipein2, err := os.Pipe()
+	ok(t, err)
+	inprocess := &libcontainer.Process{
+		Cwd:        "/",
+		Args:       []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"},
+		Env:        []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
+		ExtraFiles: []*os.File{pipein1, pipein2},
+		Stdin:      nil,
+		Stdout:     &stdout,
+	}
+	err = container.Run(inprocess)
+	ok(t, err)
+
+	waitProcess(inprocess, t)
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := stdout.String()
+	// fd 5 is the directory handle for /proc/$$/fd
+	if out != "0 1 2 3 4 5" {
+		t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out)
+	}
+	var buf = []byte{0}
+	_, err = pipeout1.Read(buf)
+	ok(t, err)
+	out1 := string(buf)
+	if out1 != "1" {
+		t.Fatalf("expected first pipe to receive '1', got '%s'", out1)
+	}
+
+	_, err = pipeout2.Read(buf)
+	ok(t, err)
+	out2 := string(buf)
+	if out2 != "2" {
+		t.Fatalf("expected second pipe to receive '2', got '%s'", out2)
+	}
+}
+
+// TestExecInOomScoreAdj sets the container's oom_score_adj and verifies an
+// exec'd process inherits that value.
+func TestExecInOomScoreAdj(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	config.OomScoreAdj = ptrInt(200)
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	buffers := newStdBuffers()
+	ps := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"/bin/sh", "-c", "cat /proc/self/oom_score_adj"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+	}
+	err = container.Run(ps)
+	ok(t, err)
+	waitProcess(ps, t)
+
+	stdinW.Close()
+	waitProcess(process, t)
+
+	out := buffers.Stdout.String()
+	if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) {
+		t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj)
+	}
+}
+
+// TestExecInUserns starts a container in a new user namespace and verifies
+// that an exec'd process joins the same user namespace as the init process.
+func TestExecInUserns(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+	config := newTemplateConfig(rootfs)
+	config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}}
+	config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	container, err := newContainer(config)
+	ok(t, err)
+	defer container.Destroy()
+
+	// Execute a first process in the container
+	stdinR, stdinW, err := os.Pipe()
+	ok(t, err)
+
+	process := &libcontainer.Process{
+		Cwd:   "/",
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR,
+		Init:  true,
+	}
+	err = container.Run(process)
+	stdinR.Close()
+	defer stdinW.Close()
+	ok(t, err)
+
+	// Record the init process's user namespace link for comparison.
+	initPID, err := process.Pid()
+	ok(t, err)
+	initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID))
+	ok(t, err)
+
+	buffers := newStdBuffers()
+	process2 := &libcontainer.Process{
+		Cwd:  "/",
+		Args: []string{"readlink", "/proc/self/ns/user"},
+		Env: []string{
+			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+		},
+		Stdout: buffers.Stdout,
+		Stderr: os.Stderr,
+	}
+	err = container.Run(process2)
+	ok(t, err)
+	waitProcess(process2, t)
+	stdinW.Close()
+	waitProcess(process, t)
+
+	if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns {
+		t.Errorf("execin userns(%s), wanted %s", out, initUserns)
+	}
+}
--- /dev/null
+package integration
+
+import (
+ "os"
+ "runtime"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+ _ "github.com/opencontainers/runc/libcontainer/nsenter"
+
+ "github.com/sirupsen/logrus"
+)
+
+// init runs the libcontainer initialization code when this test binary is
+// re-executed with the "init" argument. This works around the Go runtime's
+// issues with forking: the container init must run before the runtime spawns
+// additional threads, so it is handled here in init() on a locked OS thread.
+func init() {
+	if len(os.Args) < 2 || os.Args[1] != "init" {
+		return
+	}
+	runtime.GOMAXPROCS(1)
+	runtime.LockOSThread()
+	factory, err := libcontainer.New("")
+	if err != nil {
+		logrus.Fatalf("unable to initialize for container: %s", err)
+	}
+	if err := factory.StartInitialization(); err != nil {
+		logrus.Fatal(err)
+	}
+}
+
+// Package-level container factories shared by all integration tests;
+// initialized once in TestMain.
+var (
+	factory        libcontainer.Factory
+	systemdFactory libcontainer.Factory
+)
+
+// TestMain sets up logging and the shared container factories (cgroupfs, and
+// systemd when available) before running the test suite.
+func TestMain(m *testing.M) {
+	logrus.SetOutput(os.Stderr)
+	logrus.SetLevel(logrus.InfoLevel)
+
+	var err error
+	factory, err = libcontainer.New("/run/libctTests", libcontainer.Cgroupfs)
+	if err != nil {
+		logrus.Error(err)
+		os.Exit(1)
+	}
+	if systemd.UseSystemd() {
+		systemdFactory, err = libcontainer.New("/run/libctTests", libcontainer.SystemdCgroups)
+		if err != nil {
+			logrus.Error(err)
+			os.Exit(1)
+		}
+	}
+
+	os.Exit(m.Run())
+}
--- /dev/null
+// +build linux,cgo,seccomp
+
+package integration
+
+import (
+ "strings"
+ "syscall"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ libseccomp "github.com/seccomp/libseccomp-golang"
+)
+
+// TestSeccompDenyGetcwd blocks the getcwd syscall with ERRNO and verifies
+// that "pwd" fails with a non-zero exit and the expected stderr message.
+func TestSeccompDenyGetcwd(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	config.Seccomp = &configs.Seccomp{
+		DefaultAction: configs.Allow,
+		Syscalls: []*configs.Syscall{
+			{
+				Name:   "getcwd",
+				Action: configs.Errno,
+			},
+		},
+	}
+
+	container, err := newContainer(config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	buffers := newStdBuffers()
+	pwd := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"pwd"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+		Init:   true,
+	}
+
+	err = container.Run(pwd)
+	if err != nil {
+		t.Fatal(err)
+	}
+	ps, err := pwd.Wait()
+	if err == nil {
+		t.Fatal("Expecting error (negative return code); instead exited cleanly!")
+	}
+
+	// Decode the wait status: normal exit or signal termination.
+	var exitCode int
+	status := ps.Sys().(syscall.WaitStatus)
+	if status.Exited() {
+		exitCode = status.ExitStatus()
+	} else if status.Signaled() {
+		exitCode = -int(status.Signal())
+	} else {
+		t.Fatalf("Unrecognized exit reason!")
+	}
+
+	if exitCode == 0 {
+		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+	}
+
+	expected := "pwd: getcwd: Operation not permitted"
+	actual := strings.Trim(buffers.Stderr.String(), "\n")
+	if actual != expected {
+		t.Fatalf("Expected output %s but got %s\n", expected, actual)
+	}
+}
+
+// TestSeccompPermitWriteConditional installs a filter denying write(2) only
+// when the fd argument equals 2 (stderr); a successful "ls /" writes to
+// stdout and therefore must not be blocked.
+func TestSeccompPermitWriteConditional(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	config.Seccomp = &configs.Seccomp{
+		DefaultAction: configs.Allow,
+		Syscalls: []*configs.Syscall{
+			{
+				Name:   "write",
+				Action: configs.Errno,
+				Args: []*configs.Arg{
+					{
+						Index: 0,
+						Value: 2,
+						Op:    configs.EqualTo,
+					},
+				},
+			},
+		},
+	}
+
+	container, err := newContainer(config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	buffers := newStdBuffers()
+	lsProc := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"busybox", "ls", "/"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+		Init:   true,
+	}
+
+	if err := container.Run(lsProc); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := lsProc.Wait(); err != nil {
+		t.Fatalf("%s: %s", err, buffers.Stderr)
+	}
+}
+
+// TestSeccompDenyWriteConditional denies write(2) when the fd argument equals
+// 2 (stderr), runs a failing "ls" that must report its error to stderr, and
+// verifies the process exits non-zero with an empty stderr.
+func TestSeccompDenyWriteConditional(t *testing.T) {
+	if testing.Short() {
+		return
+	}
+
+	// Only test if library version is v2.2.1 or higher
+	// Conditional filtering will always error in v2.2.0 and lower
+	major, minor, micro := libseccomp.GetLibraryVersion()
+	if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) {
+		return
+	}
+
+	rootfs, err := newRootfs()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer remove(rootfs)
+
+	config := newTemplateConfig(rootfs)
+	config.Seccomp = &configs.Seccomp{
+		DefaultAction: configs.Allow,
+		Syscalls: []*configs.Syscall{
+			{
+				Name:   "write",
+				Action: configs.Errno,
+				Args: []*configs.Arg{
+					{
+						Index: 0,
+						Value: 2,
+						Op:    configs.EqualTo,
+					},
+				},
+			},
+		},
+	}
+
+	container, err := newContainer(config)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer container.Destroy()
+
+	buffers := newStdBuffers()
+	dmesg := &libcontainer.Process{
+		Cwd:    "/",
+		Args:   []string{"busybox", "ls", "does_not_exist"},
+		Env:    standardEnvironment,
+		Stdin:  buffers.Stdin,
+		Stdout: buffers.Stdout,
+		Stderr: buffers.Stderr,
+		Init:   true,
+	}
+
+	err = container.Run(dmesg)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	ps, err := dmesg.Wait()
+	if err == nil {
+		t.Fatal("Expecting negative return, instead got 0!")
+	}
+
+	// Decode the wait status: normal exit or signal termination.
+	var exitCode int
+	status := ps.Sys().(syscall.WaitStatus)
+	if status.Exited() {
+		exitCode = status.ExitStatus()
+	} else if status.Signaled() {
+		exitCode = -int(status.Signal())
+	} else {
+		t.Fatalf("Unrecognized exit reason!")
+	}
+
+	if exitCode == 0 {
+		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
+	}
+
+	// We're denying write to stderr, so we expect an empty buffer
+	expected := ""
+	actual := strings.Trim(buffers.Stderr.String(), "\n")
+	if actual != expected {
+		t.Fatalf("Expected output %s but got %s\n", expected, actual)
+	}
+}
+
+func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
+ if testing.Short() {
+ return
+ }
+
+ rootfs, err := newRootfs()
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer remove(rootfs)
+
+ config := newTemplateConfig(rootfs)
+ config.Seccomp = &configs.Seccomp{
+ DefaultAction: configs.Allow,
+ Syscalls: []*configs.Syscall{
+ {
+ Name: "write",
+ Action: configs.Errno,
+ Args: []*configs.Arg{
+ {
+ Index: 0,
+ Value: 2,
+ Op: configs.EqualTo,
+ },
+ {
+ Index: 2,
+ Value: 0,
+ Op: configs.NotEqualTo,
+ },
+ },
+ },
+ },
+ }
+
+ buffers, exitCode, err := runContainer(config, "", "ls", "/")
+ if err != nil {
+ t.Fatalf("%s: %s", buffers, err)
+ }
+ if exitCode != 0 {
+ t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
+ }
+ // We don't need to verify the actual thing printed
+ // Just that something was written to stdout
+ if len(buffers.Stdout.String()) == 0 {
+ t.Fatalf("Nothing was written to stdout, write call failed!\n")
+ }
+}
+
// TestSeccompDenyWriteMultipleConditions checks that a write(2) rule with
// two argument conditions (fd == 2 AND len != 0) blocks a failing "ls"
// from printing its error message to stderr.
func TestSeccompDenyWriteMultipleConditions(t *testing.T) {
	if testing.Short() {
		return
	}

	// Only test if library version is v2.2.1 or higher
	// Conditional filtering will always error in v2.2.0 and lower
	major, minor, micro := libseccomp.GetLibraryVersion()
	if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	// Deny write(2) only when arg0 (fd) == 2 and arg2 (count) != 0.
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls: []*configs.Syscall{
			{
				Name:   "write",
				Action: configs.Errno,
				Args: []*configs.Arg{
					{
						Index: 0,
						Value: 2,
						Op:    configs.EqualTo,
					},
					{
						Index: 2,
						Value: 0,
						Op:    configs.NotEqualTo,
					},
				},
			},
		},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
	if err == nil {
		t.Fatalf("Expecting error return, instead got 0")
	}
	if exitCode == 0 {
		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
	}

	// The filter blocked stderr, so nothing should have been captured.
	expected := ""
	actual := strings.Trim(buffers.Stderr.String(), "\n")
	if actual != expected {
		t.Fatalf("Expected output %s but got %s\n", expected, actual)
	}
}
+
+func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
+ if testing.Short() {
+ return
+ }
+
+ rootfs, err := newRootfs()
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer remove(rootfs)
+
+ // Prevent writing to both stdout and stderr
+ config := newTemplateConfig(rootfs)
+ config.Seccomp = &configs.Seccomp{
+ DefaultAction: configs.Allow,
+ Syscalls: []*configs.Syscall{
+ {
+ Name: "write",
+ Action: configs.Errno,
+ Args: []*configs.Arg{
+ {
+ Index: 0,
+ Value: 1,
+ Op: configs.EqualTo,
+ },
+ {
+ Index: 0,
+ Value: 2,
+ Op: configs.EqualTo,
+ },
+ },
+ },
+ },
+ }
+
+ buffers, exitCode, err := runContainer(config, "", "ls", "/")
+ if err != nil {
+ t.Fatalf("%s: %s", buffers, err)
+ }
+ if exitCode != 0 {
+ t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
+ }
+ // Verify that nothing was printed
+ if len(buffers.Stdout.String()) != 0 {
+ t.Fatalf("Something was written to stdout, write call succeeded!\n")
+ }
+}
+
// TestSeccompMultipleConditionSameArgDeniesStderr verifies that two EqualTo
// conditions on the same argument index (fd == 1 and fd == 2) block writes
// to stderr: the failing "ls" exits non-zero and its error output is lost.
func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) {
	if testing.Short() {
		return
	}

	rootfs, err := newRootfs()
	if err != nil {
		t.Fatal(err)
	}
	defer remove(rootfs)

	// Prevent writing to both stdout and stderr
	config := newTemplateConfig(rootfs)
	config.Seccomp = &configs.Seccomp{
		DefaultAction: configs.Allow,
		Syscalls: []*configs.Syscall{
			{
				Name:   "write",
				Action: configs.Errno,
				Args: []*configs.Arg{
					{
						Index: 0,
						Value: 1,
						Op:    configs.EqualTo,
					},
					{
						Index: 0,
						Value: 2,
						Op:    configs.EqualTo,
					},
				},
			},
		},
	}

	buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist")
	if err == nil {
		t.Fatalf("Expecting error return, instead got 0")
	}
	if exitCode == 0 {
		t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode)
	}
	// Verify nothing was printed
	if len(buffers.Stderr.String()) != 0 {
		t.Fatalf("Something was written to stderr, write call succeeded!\n")
	}
}
--- /dev/null
+package integration
+
+import (
+ "github.com/opencontainers/runc/libcontainer/configs"
+
+ "golang.org/x/sys/unix"
+)
+
// standardEnvironment is the default environment passed to every test
// container process.
var standardEnvironment = []string{
	"HOME=/root",
	"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
	"HOSTNAME=integration",
	"TERM=xterm",
}

// defaultMountFlags are the mount flags applied to most pseudo-filesystem
// mounts in the template config.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+
+// newTemplateConfig returns a base template for running a container
+//
+// it uses a network strategy of just setting a loopback interface
+// and the default setup for devices
+func newTemplateConfig(rootfs string) *configs.Config {
+ allowAllDevices := false
+ return &configs.Config{
+ Rootfs: rootfs,
+ Capabilities: &configs.Capabilities{
+ Bounding: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Permitted: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Inheritable: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Ambient: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ Effective: []string{
+ "CAP_CHOWN",
+ "CAP_DAC_OVERRIDE",
+ "CAP_FSETID",
+ "CAP_FOWNER",
+ "CAP_MKNOD",
+ "CAP_NET_RAW",
+ "CAP_SETGID",
+ "CAP_SETUID",
+ "CAP_SETFCAP",
+ "CAP_SETPCAP",
+ "CAP_NET_BIND_SERVICE",
+ "CAP_SYS_CHROOT",
+ "CAP_KILL",
+ "CAP_AUDIT_WRITE",
+ },
+ },
+ Namespaces: configs.Namespaces([]configs.Namespace{
+ {Type: configs.NEWNS},
+ {Type: configs.NEWUTS},
+ {Type: configs.NEWIPC},
+ {Type: configs.NEWPID},
+ {Type: configs.NEWNET},
+ }),
+ Cgroups: &configs.Cgroup{
+ Path: "integration/test",
+ Resources: &configs.Resources{
+ MemorySwappiness: nil,
+ AllowAllDevices: &allowAllDevices,
+ AllowedDevices: configs.DefaultAllowedDevices,
+ },
+ },
+ MaskPaths: []string{
+ "/proc/kcore",
+ "/sys/firmware",
+ },
+ ReadonlyPaths: []string{
+ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+ },
+ Devices: configs.DefaultAutoCreatedDevices,
+ Hostname: "integration",
+ Mounts: []*configs.Mount{
+ {
+ Source: "proc",
+ Destination: "/proc",
+ Device: "proc",
+ Flags: defaultMountFlags,
+ },
+ {
+ Source: "tmpfs",
+ Destination: "/dev",
+ Device: "tmpfs",
+ Flags: unix.MS_NOSUID | unix.MS_STRICTATIME,
+ Data: "mode=755",
+ },
+ {
+ Source: "devpts",
+ Destination: "/dev/pts",
+ Device: "devpts",
+ Flags: unix.MS_NOSUID | unix.MS_NOEXEC,
+ Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
+ },
+ {
+ Device: "tmpfs",
+ Source: "shm",
+ Destination: "/dev/shm",
+ Data: "mode=1777,size=65536k",
+ Flags: defaultMountFlags,
+ },
+ /*
+ CI is broken on the debian based kernels with this
+ {
+ Source: "mqueue",
+ Destination: "/dev/mqueue",
+ Device: "mqueue",
+ Flags: defaultMountFlags,
+ },
+ */
+ {
+ Source: "sysfs",
+ Destination: "/sys",
+ Device: "sysfs",
+ Flags: defaultMountFlags | unix.MS_RDONLY,
+ },
+ },
+ Networks: []*configs.Network{
+ {
+ Type: "loopback",
+ Address: "127.0.0.1/0",
+ Gateway: "localhost",
+ },
+ },
+ Rlimits: []configs.Rlimit{
+ {
+ Type: unix.RLIMIT_NOFILE,
+ Hard: uint64(1025),
+ Soft: uint64(1025),
+ },
+ },
+ }
+}
--- /dev/null
+package integration
+
+import (
+ "bytes"
+ "crypto/md5"
+ "encoding/hex"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "strings"
+ "syscall"
+ "testing"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// ptrInt returns a pointer to a copy of v.
func ptrInt(v int) *int {
	p := v
	return &p
}
+
// stdBuffers bundles in-memory stand-ins for a process's standard streams.
type stdBuffers struct {
	Stdin  *bytes.Buffer
	Stdout *bytes.Buffer
	Stderr *bytes.Buffer
}

// newStdBuffers returns a stdBuffers with all three streams initialized to
// empty buffers.
func newStdBuffers() *stdBuffers {
	return &stdBuffers{
		Stdin:  new(bytes.Buffer),
		Stdout: new(bytes.Buffer),
		Stderr: new(bytes.Buffer),
	}
}

// String renders the captured stderr and stdout (in that order) separated
// by "|", for use in test failure messages. Nil buffers are skipped.
func (b *stdBuffers) String() string {
	var parts []string
	for _, buf := range []*bytes.Buffer{b.Stderr, b.Stdout} {
		if buf != nil {
			parts = append(parts, buf.String())
		}
	}
	return strings.Join(parts, "|")
}
+
// ok fails the test, reporting the caller's file:line, if err is non-nil.
func ok(t testing.TB, err error) {
	if err == nil {
		return
	}
	_, file, line, _ := runtime.Caller(1)
	t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error())
}
+
// waitProcess waits for p to exit and fails the test (reporting the
// caller's file:line) if the wait errors or the process exited
// unsuccessfully.
func waitProcess(p *libcontainer.Process, t *testing.T) {
	_, file, line, _ := runtime.Caller(1)
	status, err := p.Wait()

	if err != nil {
		t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error())
	}

	if !status.Success() {
		t.Fatalf("%s:%d: unexpected status: %s\n\n", filepath.Base(file), line, status.String())
	}
}
+
// newTestRoot creates a fresh temporary directory to act as a test root.
// The caller is responsible for removing it.
func newTestRoot() (string, error) {
	// ioutil.TempDir already creates the directory, so the original
	// follow-up os.MkdirAll was redundant and has been dropped.
	dir, err := ioutil.TempDir("", "libcontainer")
	if err != nil {
		return "", err
	}
	return dir, nil
}
+
// newTestBundle creates a fresh temporary directory to act as an OCI
// bundle directory. The caller is responsible for removing it.
func newTestBundle() (string, error) {
	// ioutil.TempDir already creates the directory, so the original
	// follow-up os.MkdirAll was redundant and has been dropped.
	dir, err := ioutil.TempDir("", "bundle")
	if err != nil {
		return "", err
	}
	return dir, nil
}
+
+// newRootfs creates a new tmp directory and copies the busybox root filesystem
+func newRootfs() (string, error) {
+ dir, err := ioutil.TempDir("", "")
+ if err != nil {
+ return "", err
+ }
+ if err := os.MkdirAll(dir, 0700); err != nil {
+ return "", err
+ }
+ if err := copyBusybox(dir); err != nil {
+ return "", err
+ }
+ return dir, nil
+}
+
// remove deletes dir and everything beneath it, ignoring any error.
func remove(dir string) {
	_ = os.RemoveAll(dir)
}
+
// copyBusybox copies the rootfs for a busybox container created for the test image
// into the new directory for the specific test
//
// It shells out to `cp -a` and assumes the test image provides /busybox.
func copyBusybox(dest string) error {
	out, err := exec.Command("sh", "-c", fmt.Sprintf("cp -a /busybox/* %s/", dest)).CombinedOutput()
	if err != nil {
		return fmt.Errorf("copy error %q: %q", err, out)
	}
	return nil
}
+
+func newContainer(config *configs.Config) (libcontainer.Container, error) {
+ h := md5.New()
+ h.Write([]byte(time.Now().String()))
+ return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config)
+}
+
// newContainerWithName creates a container via the package-level factory.
// Containers parented under "system.slice" are routed to the
// systemd-backed factory instead.
func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) {
	f := factory
	if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" {
		f = systemdFactory
	}
	return f.Create(name, config)
}
+
+// runContainer runs the container with the specific config and arguments
+//
+// buffers are returned containing the STDOUT and STDERR output for the run
+// along with the exit code and any go error
+func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) {
+ container, err := newContainer(config)
+ if err != nil {
+ return nil, -1, err
+ }
+ defer container.Destroy()
+ buffers = newStdBuffers()
+ process := &libcontainer.Process{
+ Cwd: "/",
+ Args: args,
+ Env: standardEnvironment,
+ Stdin: buffers.Stdin,
+ Stdout: buffers.Stdout,
+ Stderr: buffers.Stderr,
+ Init: true,
+ }
+
+ err = container.Run(process)
+ if err != nil {
+ return buffers, -1, err
+ }
+ ps, err := process.Wait()
+ if err != nil {
+ return buffers, -1, err
+ }
+ status := ps.Sys().(syscall.WaitStatus)
+ if status.Exited() {
+ exitCode = status.ExitStatus()
+ } else if status.Signaled() {
+ exitCode = -int(status.Signal())
+ } else {
+ return buffers, -1, err
+ }
+ return
+}
--- /dev/null
+// +build linux
+
+package intelrdt
+
+import (
+ "bufio"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "sync"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+/*
+ * About Intel RDT features:
+ * Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+ * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
+ * two sub-features of RDT.
+ *
+ * Cache Allocation Technology (CAT) provides a way for the software to restrict
+ * cache allocation to a defined 'subset' of L3 cache which may be overlapping
+ * with other 'subsets'. The different subsets are identified by class of
+ * service (CLOS) and each CLOS has a capacity bitmask (CBM).
+ *
+ * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
+ * over memory bandwidth for the software. A user controls the resource by
+ * indicating the percentage of maximum memory bandwidth.
+ *
+ * More details about Intel RDT CAT and MBA can be found in the section 17.18
+ * of Intel Software Developer Manual:
+ * https://software.intel.com/en-us/articles/intel-sdm
+ *
+ * About Intel RDT kernel interface:
+ * In Linux 4.10 kernel or newer, the interface is defined and exposed via
+ * "resource control" filesystem, which is a "cgroup-like" interface.
+ *
+ * Comparing with cgroups, it has similar process management lifecycle and
+ * interfaces in a container. But unlike cgroups' hierarchy, it has single level
+ * filesystem layout.
+ *
+ * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
+ * "resource control" filesystem.
+ *
+ * Intel RDT "resource control" filesystem hierarchy:
+ * mount -t resctrl resctrl /sys/fs/resctrl
+ * tree /sys/fs/resctrl
+ * /sys/fs/resctrl/
+ * |-- info
+ * | |-- L3
+ * | | |-- cbm_mask
+ * | | |-- min_cbm_bits
+ * | | |-- num_closids
+ * | |-- MB
+ * | |-- bandwidth_gran
+ * | |-- delay_linear
+ * | |-- min_bandwidth
+ * | |-- num_closids
+ * |-- ...
+ * |-- schemata
+ * |-- tasks
+ * |-- <container_id>
+ * |-- ...
+ * |-- schemata
+ * |-- tasks
+ *
+ * For runc, we can make use of `tasks` and `schemata` configuration for L3
+ * cache and memory bandwidth resources constraints.
+ *
+ * The file `tasks` has a list of tasks that belong to this group (e.g.,
+ * the "<container_id>" group). Tasks can be added to a group by writing the task ID
+ * to the "tasks" file (which will automatically remove them from the previous
+ * group to which they belonged). New tasks created by fork(2) and clone(2) are
+ * added to the same group as their parent.
+ *
+ * The file `schemata` has a list of all the resources available to this group.
+ * Each resource (L3 cache, memory bandwidth) has its own line and format.
+ *
+ * L3 cache schema:
+ * It has allocation bitmasks/values for L3 cache on each socket, which
+ * contains L3 cache id and capacity bitmask (CBM).
+ * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+ * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
+ * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+ *
+ * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
+ * be set is less than the max bit. The max bits in the CBM is varied among
+ * supported Intel CPU models. Kernel will check if it is valid when writing.
+ * e.g., default value 0xfffff in root indicates the max bits of CBM is 20
+ * bits, which maps to the entire L3 cache capacity. Some valid CBM values to
+ * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+ *
+ * Memory bandwidth schema:
+ * It has allocation values for memory bandwidth on each socket, which contains
+ * L3 cache id and memory bandwidth percentage.
+ * Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+ * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
+ *
+ * The minimum bandwidth percentage value for each CPU model is predefined and
+ * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
+ * that is allocated is also dependent on the CPU model and can be looked up at
+ * "info/MB/bandwidth_gran". The available bandwidth control steps are:
+ * min_bw + N * bw_gran. Intermediate values are rounded to the next control
+ * step available on the hardware.
+ *
+ * For more information about Intel RDT kernel interface:
+ * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+ *
+ * An example for runc:
+ * Consider a two-socket machine with two L3 caches where the default CBM is
+ * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
+ * with a memory bandwidth granularity of 10%.
+ *
+ * Tasks inside the container only have access to the "upper" 7/11 of L3 cache
+ * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
+ * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
+ *
+ * "linux": {
+ * "intelRdt": {
+ * "l3CacheSchema": "L3:0=7f0;1=1f",
+ * "memBwSchema": "MB:0=20;1=70"
+ * }
+ * }
+ */
+
// Manager is the interface for applying, querying and tearing down a
// container's Intel RDT "resource control" group.
type Manager interface {
	// Applies Intel RDT configuration to the process with the specified pid
	Apply(pid int) error

	// Returns statistics for Intel RDT
	GetStats() (*Stats, error)

	// Destroys the Intel RDT 'container_id' group
	Destroy() error

	// Returns Intel RDT path to save in a state file and to be able to
	// restore the object later
	GetPath() string

	// Set Intel RDT "resource control" filesystem as configured.
	Set(container *configs.Config) error
}
+
// IntelRdtManager implements interface Manager on top of the Intel RDT
// "resource control" filesystem.
type IntelRdtManager struct {
	mu     sync.Mutex      // guards Path against concurrent Apply/Destroy/GetStats
	Config *configs.Config // container config; its IntelRdt section drives Set
	Id     string          // container id, used as the group directory name
	Path   string          // cached 'container_id' group path (see GetPath)
}

const (
	// IntelRdtTasks is the per-group file listing member task IDs.
	IntelRdtTasks = "tasks"
)

var (
	// The absolute root path of the Intel RDT "resource control" filesystem
	intelRdtRoot string
	// intelRdtRootLock guards the lazy initialization of intelRdtRoot.
	intelRdtRootLock sync.Mutex

	// The flag to indicate if Intel RDT/CAT is enabled
	isCatEnabled bool
	// The flag to indicate if Intel RDT/MBA is enabled
	isMbaEnabled bool
)

// intelRdtData bundles what join() needs: the resctrl root, the container
// config, and the pid to attach.
type intelRdtData struct {
	root   string
	config *configs.Config
	pid    int
}
+
// Check if Intel RDT sub-features are enabled in init()
func init() {
	// 1. Check if hardware and kernel support Intel RDT sub-features
	// "cat_l3" flag for CAT and "mba" flag for MBA
	isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
	if err != nil {
		// Errors are deliberately swallowed: RDT simply stays disabled.
		return
	}

	// 2. Check if Intel RDT "resource control" filesystem is mounted
	// The user guarantees to mount the filesystem
	if !isIntelRdtMounted() {
		return
	}

	// 3. Double check if Intel RDT sub-features are available in
	// "resource control" filesystem. Intel RDT sub-features can be
	// selectively disabled or enabled by kernel command line
	// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
	if isCatFlagSet {
		if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
			isCatEnabled = true
		}
	}
	if isMbaFlagSet {
		if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
			isMbaEnabled = true
		}
	}
}
+
+// Return the mount point path of Intel RDT "resource control" filesysem
+func findIntelRdtMountpointDir() (string, error) {
+ f, err := os.Open("/proc/self/mountinfo")
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ s := bufio.NewScanner(f)
+ for s.Scan() {
+ text := s.Text()
+ fields := strings.Split(text, " ")
+ // Safe as mountinfo encodes mountpoints with spaces as \040.
+ index := strings.Index(text, " - ")
+ postSeparatorFields := strings.Fields(text[index+3:])
+ numPostFields := len(postSeparatorFields)
+
+ // This is an error as we can't detect if the mount is for "Intel RDT"
+ if numPostFields == 0 {
+ return "", fmt.Errorf("Found no fields post '-' in %q", text)
+ }
+
+ if postSeparatorFields[0] == "resctrl" {
+ // Check that the mount is properly formatted.
+ if numPostFields < 3 {
+ return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+ }
+
+ return fields[4], nil
+ }
+ }
+ if err := s.Err(); err != nil {
+ return "", err
+ }
+
+ return "", NewNotFoundError("Intel RDT")
+}
+
// Gets the root path of Intel RDT "resource control" filesystem.
// The result is resolved once via findIntelRdtMountpointDir and then
// cached in intelRdtRoot under intelRdtRootLock.
func getIntelRdtRoot() (string, error) {
	intelRdtRootLock.Lock()
	defer intelRdtRootLock.Unlock()

	if intelRdtRoot != "" {
		return intelRdtRoot, nil
	}

	root, err := findIntelRdtMountpointDir()
	if err != nil {
		return "", err
	}

	// Make sure the mount point actually exists before caching it.
	if _, err := os.Stat(root); err != nil {
		return "", err
	}

	intelRdtRoot = root
	return intelRdtRoot, nil
}
+
+func isIntelRdtMounted() bool {
+ _, err := getIntelRdtRoot()
+ if err != nil {
+ return false
+ }
+
+ return true
+}
+
// parseCpuInfoFile reads path (normally /proc/cpuinfo) and reports whether
// the "cat_l3" (CAT) and "mba" (MBA) CPU flags are present. Only the first
// "flags" line is inspected.
func parseCpuInfoFile(path string) (bool, bool, error) {
	isCatFlagSet := false
	isMbaFlagSet := false

	f, err := os.Open(path)
	if err != nil {
		return false, false, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()

		// Search "cat_l3" and "mba" flags in first "flags" line
		if strings.Contains(line, "flags") {
			for _, flag := range strings.Split(line, " ") {
				switch flag {
				case "cat_l3":
					isCatFlagSet = true
				case "mba":
					isMbaFlagSet = true
				}
			}
			return isCatFlagSet, isMbaFlagSet, nil
		}
	}
	// BUG FIX: the scanner error was previously checked inside the loop,
	// where it can never be observed (Scan returns false on error and the
	// loop exits). Check it once after the loop instead.
	if err := s.Err(); err != nil {
		return false, false, err
	}
	return isCatFlagSet, isMbaFlagSet, nil
}
+
// parseUint parses s as an unsigned integer. Negative inputs — including
// ones that underflow int64 — are reported as 0 with a nil error; all
// other parse failures are returned as-is.
func parseUint(s string, base, bitSize int) (uint64, error) {
	value, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return value, nil
	}

	// Retry as a signed integer so negative values can be recognized.
	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	switch {
	case intErr == nil && intValue < 0:
		// In-range negative value: clamp to 0.
		return 0, nil
	case intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0:
		// Negative value below MinInt64: also clamp to 0.
		return 0, nil
	}
	return value, err
}
+
+// Gets a single uint64 value from the specified file.
+func getIntelRdtParamUint(path, file string) (uint64, error) {
+ fileName := filepath.Join(path, file)
+ contents, err := ioutil.ReadFile(fileName)
+ if err != nil {
+ return 0, err
+ }
+
+ res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
+ if err != nil {
+ return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
+ }
+ return res, nil
+}
+
// getIntelRdtParamString reads path/file and returns its contents with
// surrounding whitespace trimmed.
func getIntelRdtParamString(path, file string) (string, error) {
	raw, err := ioutil.ReadFile(filepath.Join(path, file))
	if err != nil {
		return "", err
	}
	value := strings.TrimSpace(string(raw))
	return value, nil
}
+
// writeFile writes data, with a trailing newline appended, to dir/file.
// An empty dir is rejected up front rather than silently writing into the
// current directory.
func writeFile(dir, file, data string) error {
	if dir == "" {
		return fmt.Errorf("no such directory for %s", file)
	}
	payload := []byte(data + "\n")
	if err := ioutil.WriteFile(filepath.Join(dir, file), payload, 0700); err != nil {
		return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
	}
	return nil
}
+
// getIntelRdtData resolves the resctrl root and bundles it together with
// the container config and pid for a later join().
func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) {
	rootPath, err := getIntelRdtRoot()
	if err != nil {
		return nil, err
	}
	return &intelRdtData{
		root:   rootPath,
		config: c,
		pid:    pid,
	}, nil
}
+
// Get the read-only L3 cache information from info/L3.
// On error the (possibly partially filled) struct is returned along with
// the error.
func getL3CacheInfo() (*L3CacheInfo, error) {
	l3CacheInfo := &L3CacheInfo{}

	rootPath, err := getIntelRdtRoot()
	if err != nil {
		return l3CacheInfo, err
	}

	path := filepath.Join(rootPath, "info", "L3")
	cbmMask, err := getIntelRdtParamString(path, "cbm_mask")
	if err != nil {
		return l3CacheInfo, err
	}
	minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits")
	if err != nil {
		return l3CacheInfo, err
	}
	numClosids, err := getIntelRdtParamUint(path, "num_closids")
	if err != nil {
		return l3CacheInfo, err
	}

	l3CacheInfo.CbmMask = cbmMask
	l3CacheInfo.MinCbmBits = minCbmBits
	l3CacheInfo.NumClosids = numClosids

	return l3CacheInfo, nil
}
+
// Get the read-only memory bandwidth information from info/MB.
// On error the (possibly partially filled) struct is returned along with
// the error.
func getMemBwInfo() (*MemBwInfo, error) {
	memBwInfo := &MemBwInfo{}

	rootPath, err := getIntelRdtRoot()
	if err != nil {
		return memBwInfo, err
	}

	path := filepath.Join(rootPath, "info", "MB")
	bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran")
	if err != nil {
		return memBwInfo, err
	}
	delayLinear, err := getIntelRdtParamUint(path, "delay_linear")
	if err != nil {
		return memBwInfo, err
	}
	minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth")
	if err != nil {
		return memBwInfo, err
	}
	numClosids, err := getIntelRdtParamUint(path, "num_closids")
	if err != nil {
		return memBwInfo, err
	}

	memBwInfo.BandwidthGran = bandwidthGran
	memBwInfo.DelayLinear = delayLinear
	memBwInfo.MinBandwidth = minBandwidth
	memBwInfo.NumClosids = numClosids

	return memBwInfo, nil
}
+
+// Get diagnostics for last filesystem operation error from file info/last_cmd_status
+func getLastCmdStatus() (string, error) {
+ rootPath, err := getIntelRdtRoot()
+ if err != nil {
+ return "", err
+ }
+
+ path := filepath.Join(rootPath, "info")
+ lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status")
+ if err != nil {
+ return "", err
+ }
+
+ return lastCmdStatus, nil
+}
+
// WriteIntelRdtTasks writes the specified pid into the "tasks" file,
// attaching the task to the group rooted at dir.
func WriteIntelRdtTasks(dir string, pid int) error {
	if dir == "" {
		return fmt.Errorf("no such directory for %s", IntelRdtTasks)
	}

	// Don't attach any pid if -1 is specified as a pid
	if pid != -1 {
		if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil {
			return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
		}
	}
	return nil
}
+
// IsCatEnabled checks if Intel RDT/CAT (L3 cache allocation) is enabled,
// as determined once at package init.
func IsCatEnabled() bool {
	return isCatEnabled
}

// IsMbaEnabled checks if Intel RDT/MBA (memory bandwidth allocation) is
// enabled, as determined once at package init.
func IsMbaEnabled() bool {
	return isMbaEnabled
}
+
+// Get the 'container_id' path in Intel RDT "resource control" filesystem
+func GetIntelRdtPath(id string) (string, error) {
+ rootPath, err := getIntelRdtRoot()
+ if err != nil {
+ return "", err
+ }
+
+ path := filepath.Join(rootPath, id)
+ return path, nil
+}
+
+// Applies Intel RDT configuration to the process with the specified pid
+func (m *IntelRdtManager) Apply(pid int) (err error) {
+ // If intelRdt is not specified in config, we do nothing
+ if m.Config.IntelRdt == nil {
+ return nil
+ }
+ d, err := getIntelRdtData(m.Config, pid)
+ if err != nil && !IsNotFound(err) {
+ return err
+ }
+
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ path, err := d.join(m.Id)
+ if err != nil {
+ return err
+ }
+
+ m.Path = path
+ return nil
+}
+
// Destroys the Intel RDT 'container_id' group by removing its directory
// and clearing the cached path.
func (m *IntelRdtManager) Destroy() error {
	m.mu.Lock()
	defer m.mu.Unlock()
	if err := os.RemoveAll(m.Path); err != nil {
		return err
	}
	m.Path = ""
	return nil
}
+
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later. The path is resolved lazily and cached; a
// resolution failure is deliberately ignored and yields "".
func (m *IntelRdtManager) GetPath() string {
	if m.Path == "" {
		m.Path, _ = GetIntelRdtPath(m.Id)
	}
	return m.Path
}
+
// Returns statistics for Intel RDT: the read-only CAT/MBA info plus the
// "schemata" lines of both the root group and this container's group, for
// whichever of CAT and MBA is enabled. Returns (nil, nil) when the config
// has no IntelRdt section.
func (m *IntelRdtManager) GetStats() (*Stats, error) {
	// If intelRdt is not specified in config
	if m.Config.IntelRdt == nil {
		return nil, nil
	}

	m.mu.Lock()
	defer m.mu.Unlock()
	stats := NewStats()

	rootPath, err := getIntelRdtRoot()
	if err != nil {
		return nil, err
	}
	// The read-only L3 cache and memory bandwidth schemata in root
	tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
	if err != nil {
		return nil, err
	}
	schemaRootStrings := strings.Split(tmpRootStrings, "\n")

	// The L3 cache and memory bandwidth schemata in 'container_id' group
	tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata")
	if err != nil {
		return nil, err
	}
	schemaStrings := strings.Split(tmpStrings, "\n")

	if IsCatEnabled() {
		// The read-only L3 cache information
		l3CacheInfo, err := getL3CacheInfo()
		if err != nil {
			return nil, err
		}
		stats.L3CacheInfo = l3CacheInfo

		// The read-only L3 cache schema in root
		for _, schemaRoot := range schemaRootStrings {
			if strings.Contains(schemaRoot, "L3") {
				stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot)
			}
		}

		// The L3 cache schema in 'container_id' group
		for _, schema := range schemaStrings {
			if strings.Contains(schema, "L3") {
				stats.L3CacheSchema = strings.TrimSpace(schema)
			}
		}
	}

	if IsMbaEnabled() {
		// The read-only memory bandwidth information
		memBwInfo, err := getMemBwInfo()
		if err != nil {
			return nil, err
		}
		stats.MemBwInfo = memBwInfo

		// The memory bandwidth schema in root
		for _, schemaRoot := range schemaRootStrings {
			if strings.Contains(schemaRoot, "MB") {
				stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot)
			}
		}

		// The memory bandwidth schema in 'container_id' group
		for _, schema := range schemaStrings {
			if strings.Contains(schema, "MB") {
				stats.MemBwSchema = strings.TrimSpace(schema)
			}
		}
	}

	return stats, nil
}
+
+// Set Intel RDT "resource control" filesystem as configured.
+func (m *IntelRdtManager) Set(container *configs.Config) error {
+ // About L3 cache schema:
+ // It has allocation bitmasks/values for L3 cache on each socket,
+ // which contains L3 cache id and capacity bitmask (CBM).
+ // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+ // For example, on a two-socket machine, the schema line could be:
+ // L3:0=ff;1=c0
+ // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM
+ // is 0xc0.
+ //
+ // The valid L3 cache CBM is a *contiguous bits set* and number of
+ // bits that can be set is less than the max bit. The max bits in the
+ // CBM is varied among supported Intel CPU models. Kernel will check
+ // if it is valid when writing. e.g., default value 0xfffff in root
+ // indicates the max bits of CBM is 20 bits, which mapping to entire
+ // L3 cache capacity. Some valid CBM values to set in a group:
+ // 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+ //
+ //
+ // About memory bandwidth schema:
+ // It has allocation values for memory bandwidth on each socket, which
+ // contains L3 cache id and memory bandwidth percentage.
+ // Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+ // For example, on a two-socket machine, the schema line could be:
+ // "MB:0=20;1=70"
+ //
+ // The minimum bandwidth percentage value for each CPU model is
+ // predefined and can be looked up through "info/MB/min_bandwidth".
+ // The bandwidth granularity that is allocated is also dependent on
+ // the CPU model and can be looked up at "info/MB/bandwidth_gran".
+ // The available bandwidth control steps are: min_bw + N * bw_gran.
+ // Intermediate values are rounded to the next control step available
+ // on the hardware.
+ if container.IntelRdt != nil {
+ path := m.GetPath()
+ l3CacheSchema := container.IntelRdt.L3CacheSchema
+ memBwSchema := container.IntelRdt.MemBwSchema
+
+ // Write a single joint schema string to schemata file
+ if l3CacheSchema != "" && memBwSchema != "" {
+ if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil {
+ return NewLastCmdError(err)
+ }
+ }
+
+ // Write only L3 cache schema string to schemata file
+ if l3CacheSchema != "" && memBwSchema == "" {
+ if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
+ return NewLastCmdError(err)
+ }
+ }
+
+ // Write only memory bandwidth schema string to schemata file
+ if l3CacheSchema == "" && memBwSchema != "" {
+ if err := writeFile(path, "schemata", memBwSchema); err != nil {
+ return NewLastCmdError(err)
+ }
+ }
+ }
+
+ return nil
+}
+
// join creates (if necessary) the group directory for id under the resctrl
// root and writes raw.pid into its "tasks" file. Errors are decorated with
// the kernel's last_cmd_status diagnostics.
func (raw *intelRdtData) join(id string) (string, error) {
	path := filepath.Join(raw.root, id)
	if err := os.MkdirAll(path, 0755); err != nil {
		return "", NewLastCmdError(err)
	}

	if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
		return "", NewLastCmdError(err)
	}
	return path, nil
}
+
// NotFoundError reports that the mountpoint of a resource control
// filesystem could not be located.
type NotFoundError struct {
	ResourceControl string
}

// Error implements the error interface.
func (e *NotFoundError) Error() string {
	return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl)
}

// NewNotFoundError returns a *NotFoundError for the given resource.
func NewNotFoundError(res string) error {
	return &NotFoundError{
		ResourceControl: res,
	}
}

// IsNotFound reports whether err is a *NotFoundError.
func IsNotFound(err error) bool {
	if err == nil {
		return false
	}
	_, ok := err.(*NotFoundError)
	return ok
}

// LastCmdError annotates an error with the contents of the resctrl
// "last_cmd_status" file, which records why the kernel rejected the
// most recent resctrl filesystem operation.
type LastCmdError struct {
	LastCmdStatus string
	Err           error
}

// Error implements the error interface.
//
// Fix: the original passed a dynamically built string as the *format*
// argument of fmt.Sprintf, so any '%' verb contained in the wrapped
// error or status text was mangled (e.g. rendered as "%!s(MISSING)");
// go vet flags this as a non-constant format string. Plain
// concatenation produces the intended message verbatim.
func (e *LastCmdError) Error() string {
	return e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus
}
+
+func NewLastCmdError(err error) error {
+ lastCmdStatus, err1 := getLastCmdStatus()
+ if err1 == nil {
+ return &LastCmdError{
+ LastCmdStatus: lastCmdStatus,
+ Err: err,
+ }
+ }
+ return err
+}
--- /dev/null
+// +build linux
+
+package intelrdt
+
+import (
+ "strings"
+ "testing"
+)
+
+func TestIntelRdtSetL3CacheSchema(t *testing.T) {
+ if !IsCatEnabled() {
+ return
+ }
+
+ helper := NewIntelRdtTestUtil(t)
+ defer helper.cleanup()
+
+ const (
+ l3CacheSchemaBefore = "L3:0=f;1=f0"
+ l3CacheSchemeAfter = "L3:0=f0;1=f"
+ )
+
+ helper.writeFileContents(map[string]string{
+ "schemata": l3CacheSchemaBefore + "\n",
+ })
+
+ helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
+ intelrdt := &IntelRdtManager{
+ Config: helper.IntelRdtData.config,
+ Path: helper.IntelRdtPath,
+ }
+ if err := intelrdt.Set(helper.IntelRdtData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata")
+ if err != nil {
+ t.Fatalf("Failed to parse file 'schemata' - %s", err)
+ }
+ values := strings.Split(tmpStrings, "\n")
+ value := values[0]
+
+ if value != l3CacheSchemeAfter {
+ t.Fatal("Got the wrong value, set 'schemata' failed.")
+ }
+}
+
+func TestIntelRdtSetMemBwSchema(t *testing.T) {
+ if !IsMbaEnabled() {
+ return
+ }
+
+ helper := NewIntelRdtTestUtil(t)
+ defer helper.cleanup()
+
+ const (
+ memBwSchemaBefore = "MB:0=20;1=70"
+ memBwSchemeAfter = "MB:0=70;1=20"
+ )
+
+ helper.writeFileContents(map[string]string{
+ "schemata": memBwSchemaBefore + "\n",
+ })
+
+ helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwSchemeAfter
+ intelrdt := &IntelRdtManager{
+ Config: helper.IntelRdtData.config,
+ Path: helper.IntelRdtPath,
+ }
+ if err := intelrdt.Set(helper.IntelRdtData.config); err != nil {
+ t.Fatal(err)
+ }
+
+ tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata")
+ if err != nil {
+ t.Fatalf("Failed to parse file 'schemata' - %s", err)
+ }
+ values := strings.Split(tmpStrings, "\n")
+ value := values[0]
+
+ if value != memBwSchemeAfter {
+ t.Fatal("Got the wrong value, set 'schemata' failed.")
+ }
+}
--- /dev/null
+// +build linux
+
+package intelrdt
+
// L3CacheInfo holds the read-only L3 cache (CAT) properties reported
// by the resctrl filesystem.
type L3CacheInfo struct {
	// Capacity bitmask; e.g. "fffff" for a 20-bit CBM.
	CbmMask string `json:"cbm_mask,omitempty"`
	// Minimum number of CBM bits that may be set.
	MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
	// Number of available CLOS ids (control groups).
	NumClosids uint64 `json:"num_closids,omitempty"`
}

// MemBwInfo holds the read-only memory bandwidth (MBA) properties
// reported by the resctrl filesystem.
type MemBwInfo struct {
	// Granularity of the allocatable bandwidth percentage.
	BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
	// Non-zero when the delay scale is linear.
	DelayLinear uint64 `json:"delay_linear,omitempty"`
	// Minimum allocatable bandwidth percentage.
	MinBandwidth uint64 `json:"min_bandwidth,omitempty"`
	// Number of available CLOS ids (control groups).
	NumClosids uint64 `json:"num_closids,omitempty"`
}

// Stats aggregates the Intel RDT state reported for a container.
type Stats struct {
	// The read-only L3 cache information
	L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`

	// The read-only L3 cache schema in root
	L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`

	// The L3 cache schema in 'container_id' group
	L3CacheSchema string `json:"l3_cache_schema,omitempty"`

	// The read-only memory bandwidth information
	MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`

	// The read-only memory bandwidth schema in root
	MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`

	// The memory bandwidth schema in 'container_id' group
	MemBwSchema string `json:"mem_bw_schema,omitempty"`
}

// NewStats returns an empty Stats object ready to be filled in.
func NewStats() *Stats {
	return &Stats{}
}
--- /dev/null
+// +build linux
+
+/*
+ * Utility for testing Intel RDT operations.
+ * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test.
+ */
+package intelrdt
+
+import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// intelRdtTestUtil drives tests against a mock of the Intel RDT
// "resource control" filesystem rooted in a temporary directory.
type intelRdtTestUtil struct {
	// intelRdt data to use in tests
	IntelRdtData *intelRdtData

	// Path to the mock Intel RDT "resource control" filesystem directory
	IntelRdtPath string

	// Temporary directory to store mock Intel RDT "resource control" filesystem
	tempDir string
	// Test handle used to fail fast from helper methods.
	t *testing.T
}
+
+// Creates a new test util
+func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
+ d := &intelRdtData{
+ config: &configs.Config{
+ IntelRdt: &configs.IntelRdt{},
+ },
+ }
+ tempDir, err := ioutil.TempDir("", "intelrdt_test")
+ if err != nil {
+ t.Fatal(err)
+ }
+ d.root = tempDir
+ testIntelRdtPath := filepath.Join(d.root, "resctrl")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Ensure the full mock Intel RDT "resource control" filesystem path exists
+ err = os.MkdirAll(testIntelRdtPath, 0755)
+ if err != nil {
+ t.Fatal(err)
+ }
+ return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t}
+}
+
// cleanup removes the temporary mock "resource control" directory tree.
func (c *intelRdtTestUtil) cleanup() {
	os.RemoveAll(c.tempDir)
}
+
+// Write the specified contents on the mock of the specified Intel RDT "resource control" files
+func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) {
+ for file, contents := range fileContents {
+ err := writeFile(c.IntelRdtPath, file, contents)
+ if err != nil {
+ c.t.Fatal(err)
+ }
+ }
+}
--- /dev/null
+// +build linux
+
+package keys
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "github.com/pkg/errors"
+
+ "golang.org/x/sys/unix"
+)
+
+type KeySerial uint32
+
+func JoinSessionKeyring(name string) (KeySerial, error) {
+ sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
+ if err != nil {
+ return 0, errors.Wrap(err, "create session key")
+ }
+ return KeySerial(sessKeyId), nil
+}
+
+// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
+// anding the bits with the given mask (clearing permissions) and setting
+// additional permission bits
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
+ dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId))
+ if err != nil {
+ return err
+ }
+
+ res := strings.Split(dest, ";")
+ if len(res) < 5 {
+ return fmt.Errorf("Destination buffer for key description is too small")
+ }
+
+ // parse permissions
+ perm64, err := strconv.ParseUint(res[3], 16, 32)
+ if err != nil {
+ return err
+ }
+
+ perm := (uint32(perm64) & mask) | setbits
+
+ return unix.KeyctlSetperm(int(ringId), perm)
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "github.com/vishvananda/netlink/nl"
+ "golang.org/x/sys/unix"
+)
+
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types.
//
// These values must stay in sync with the corresponding *_ATTR defines
// in libcontainer/nsenter/nsexec.c.
const (
	InitMsg          uint16 = 62000
	CloneFlagsAttr   uint16 = 27281
	NsPathsAttr      uint16 = 27282
	UidmapAttr       uint16 = 27283
	GidmapAttr       uint16 = 27284
	SetgroupAttr     uint16 = 27285
	OomScoreAdjAttr  uint16 = 27286
	RootlessEUIDAttr uint16 = 27287
	UidmapPathAttr   uint16 = 27288
	GidmapPathAttr   uint16 = 27289
)
+
+type Int32msg struct {
+ Type uint16
+ Value uint32
+}
+
+// Serialize serializes the message.
+// Int32msg has the following representation
+// | nlattr len | nlattr type |
+// | uint32 value |
+func (msg *Int32msg) Serialize() []byte {
+ buf := make([]byte, msg.Len())
+ native := nl.NativeEndian()
+ native.PutUint16(buf[0:2], uint16(msg.Len()))
+ native.PutUint16(buf[2:4], msg.Type)
+ native.PutUint32(buf[4:8], msg.Value)
+ return buf
+}
+
+func (msg *Int32msg) Len() int {
+ return unix.NLA_HDRLEN + 4
+}
+
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
	Type  uint16
	Value []byte
}

// Serialize encodes the message as a netlink attribute. The buffer is
// rounded up to the next NLA_ALIGNTO boundary (the "& ^(NLA_ALIGNTO-1)"
// mask clears the low bits after adding ALIGNTO-1); the nlattr length
// field still records the unpadded length from Len().
func (msg *Bytemsg) Serialize() []byte {
	l := msg.Len()
	buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(l))
	native.PutUint16(buf[2:4], msg.Type)
	copy(buf[4:], msg.Value)
	return buf
}

// Len returns the unpadded attribute length: header, payload, and one
// byte for the terminating NUL.
func (msg *Bytemsg) Len() int {
	return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
+
+type Boolmsg struct {
+ Type uint16
+ Value bool
+}
+
+func (msg *Boolmsg) Serialize() []byte {
+ buf := make([]byte, msg.Len())
+ native := nl.NativeEndian()
+ native.PutUint16(buf[0:2], uint16(msg.Len()))
+ native.PutUint16(buf[2:4], msg.Type)
+ if msg.Value {
+ native.PutUint32(buf[4:8], uint32(1))
+ } else {
+ native.PutUint32(buf[4:8], uint32(0))
+ }
+ return buf
+}
+
+func (msg *Boolmsg) Len() int {
+ return unix.NLA_HDRLEN + 4 // alignment
+}
--- /dev/null
+package mount
+
+// GetMounts retrieves a list of mounts for the current running process.
+func GetMounts() ([]*Info, error) {
+ return parseMountTable()
+}
+
+// Mounted looks at /proc/self/mountinfo to determine of the specified
+// mountpoint has been mounted
+func Mounted(mountpoint string) (bool, error) {
+ entries, err := parseMountTable()
+ if err != nil {
+ return false, err
+ }
+
+ // Search the table for the mountpoint
+ for _, e := range entries {
+ if e.Mountpoint == mountpoint {
+ return true, nil
+ }
+ }
+ return false, nil
+}
--- /dev/null
+// +build linux
+
+package mount
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+)
+
const (
	/* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
	   (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)

	   (1) mount ID:  unique identifier of the mount (may be reused after umount)
	   (2) parent ID:  ID of parent (or of self for the top of the mount tree)
	   (3) major:minor:  value of st_dev for files on filesystem
	   (4) root:  root of the mount within the filesystem
	   (5) mount point:  mount point relative to the process's root
	   (6) mount options:  per mount options
	   (7) optional fields:  zero or more fields of the form "tag[:value]"
	   (8) separator:  marks the end of the optional fields
	   (9) filesystem type:  name of filesystem of the form "type[.subtype]"
	   (10) mount source:  filesystem specific information or "none"
	   (11) super options:  per super block options*/
	// Note: each %s stops at whitespace, so the final %s captures only
	// the first optional field (or the "-" separator when there are
	// none); everything after " - " is parsed separately.
	mountinfoFormat = "%d %d %d:%d %s %s %s %s"
)
+
+// Parse /proc/self/mountinfo because comparing Dev and ino does not work from
+// bind mounts
+func parseMountTable() ([]*Info, error) {
+ f, err := os.Open("/proc/self/mountinfo")
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ return parseInfoFile(f)
+}
+
+func parseInfoFile(r io.Reader) ([]*Info, error) {
+ var (
+ s = bufio.NewScanner(r)
+ out = []*Info{}
+ )
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ var (
+ p = &Info{}
+ text = s.Text()
+ optionalFields string
+ )
+
+ if _, err := fmt.Sscanf(text, mountinfoFormat,
+ &p.ID, &p.Parent, &p.Major, &p.Minor,
+ &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil {
+ return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err)
+ }
+ // Safe as mountinfo encodes mountpoints with spaces as \040.
+ index := strings.Index(text, " - ")
+ postSeparatorFields := strings.Fields(text[index+3:])
+ if len(postSeparatorFields) < 3 {
+ return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+ }
+
+ if optionalFields != "-" {
+ p.Optional = optionalFields
+ }
+
+ p.Fstype = postSeparatorFields[0]
+ p.Source = postSeparatorFields[1]
+ p.VfsOpts = strings.Join(postSeparatorFields[2:], " ")
+ out = append(out, p)
+ }
+ return out, nil
+}
--- /dev/null
+package mount
+
// Info reveals information about a particular mounted filesystem. This
// struct is populated from the content in the /proc/<pid>/mountinfo file.
type Info struct {
	// ID is a unique identifier of the mount (may be reused after umount).
	ID int

	// Parent indicates the ID of the mount parent (or of self for the top of the
	// mount tree).
	Parent int

	// Major indicates one half of the device ID which identifies the device class.
	Major int

	// Minor indicates one half of the device ID which identifies a specific
	// instance of device.
	Minor int

	// Root of the mount within the filesystem.
	Root string

	// Mountpoint indicates the mount point relative to the process's root.
	Mountpoint string

	// Opts represents mount-specific options.
	Opts string

	// Optional represents optional fields; it is empty when the source
	// line had none (the "-" separator is never stored here).
	Optional string

	// Fstype indicates the type of filesystem, such as EXT3.
	Fstype string

	// Source indicates filesystem specific information or "none".
	Source string

	// VfsOpts represents per super block options.
	VfsOpts string
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/vishvananda/netlink"
+)
+
// strategies maps a network type name to its implementation; only the
// loopback strategy is currently registered.
var strategies = map[string]networkStrategy{
	"loopback": &loopback{},
}

// networkStrategy represents a specific network configuration for
// a container's networking stack
type networkStrategy interface {
	create(*network, int) error
	initialize(*network) error
	detach(*configs.Network) error
	attach(*configs.Network) error
}
+
+// getStrategy returns the specific network strategy for the
+// provided type.
+func getStrategy(tpe string) (networkStrategy, error) {
+ s, exists := strategies[tpe]
+ if !exists {
+ return nil, fmt.Errorf("unknown strategy type %q", tpe)
+ }
+ return s, nil
+}
+
+// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
+func getNetworkInterfaceStats(interfaceName string) (*NetworkInterface, error) {
+ out := &NetworkInterface{Name: interfaceName}
+ // This can happen if the network runtime information is missing - possible if the
+ // container was created by an old version of libcontainer.
+ if interfaceName == "" {
+ return out, nil
+ }
+ type netStatsPair struct {
+ // Where to write the output.
+ Out *uint64
+ // The network stats file to read.
+ File string
+ }
+ // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
+ netStats := []netStatsPair{
+ {Out: &out.RxBytes, File: "tx_bytes"},
+ {Out: &out.RxPackets, File: "tx_packets"},
+ {Out: &out.RxErrors, File: "tx_errors"},
+ {Out: &out.RxDropped, File: "tx_dropped"},
+
+ {Out: &out.TxBytes, File: "rx_bytes"},
+ {Out: &out.TxPackets, File: "rx_packets"},
+ {Out: &out.TxErrors, File: "rx_errors"},
+ {Out: &out.TxDropped, File: "rx_dropped"},
+ }
+ for _, netStat := range netStats {
+ data, err := readSysfsNetworkStats(interfaceName, netStat.File)
+ if err != nil {
+ return nil, err
+ }
+ *(netStat.Out) = data
+ }
+ return out, nil
+}
+
// readSysfsNetworkStats reads one counter file from
// /sys/class/net/<ethInterface>/statistics and parses it as a uint64.
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
	statsPath := filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)
	data, err := ioutil.ReadFile(statsPath)
	if err != nil {
		return 0, err
	}
	value := strings.TrimSpace(string(data))
	return strconv.ParseUint(value, 10, 64)
}
+
// loopback is a network strategy that provides a basic loopback device
type loopback struct {
}

// create is a no-op: the loopback device already exists in every new
// network namespace.
func (l *loopback) create(n *network, nspid int) error {
	return nil
}

// initialize brings the "lo" link up inside the container's namespace.
func (l *loopback) initialize(config *network) error {
	return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}

// attach is a no-op for loopback.
func (l *loopback) attach(n *configs.Network) (err error) {
	return nil
}

// detach is a no-op for loopback.
func (l *loopback) detach(n *configs.Network) (err error) {
	return nil
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+
+ "golang.org/x/sys/unix"
+)
+
// oomCgroupName is the cgroup subsystem used for both OOM and memory
// pressure notifications.
const oomCgroupName = "memory"

// PressureLevel selects which memory.pressure_level threshold to
// subscribe to; the levels serialize as "low", "medium" and "critical".
type PressureLevel uint

const (
	LowPressure PressureLevel = iota
	MediumPressure
	CriticalPressure
)
+
// registerMemoryEvent registers an eventfd with the cgroup's
// cgroup.event_control file so that kernel events for evName (with
// optional argument arg, e.g. a pressure level) are delivered on the
// returned channel.
//
// Both file descriptors are owned by the spawned goroutine, which
// forwards each eventfd tick to the channel and closes the fds and
// the channel once the eventfd read fails or the cgroup is gone.
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
	evFile, err := os.Open(filepath.Join(cgDir, evName))
	if err != nil {
		return nil, err
	}
	fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		// Don't leak the event file on the error path.
		evFile.Close()
		return nil, err
	}

	eventfd := os.NewFile(uintptr(fd), "eventfd")

	eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
	// The control file takes "<event_fd> <target_fd> [args]".
	data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
	if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
		eventfd.Close()
		evFile.Close()
		return nil, err
	}
	ch := make(chan struct{})
	go func() {
		defer func() {
			eventfd.Close()
			evFile.Close()
			close(ch)
		}()
		// Each event arrives as an 8-byte counter on the eventfd.
		buf := make([]byte, 8)
		for {
			if _, err := eventfd.Read(buf); err != nil {
				return
			}
			// When a cgroup is destroyed, an event is sent to eventfd.
			// So if the control path is gone, return instead of notifying.
			if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
				return
			}
			ch <- struct{}{}
		}
	}()
	return ch, nil
}
+
+// notifyOnOOM returns channel on which you can expect event about OOM,
+// if process died without OOM this channel will be closed.
+func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
+ dir := paths[oomCgroupName]
+ if dir == "" {
+ return nil, fmt.Errorf("path %q missing", oomCgroupName)
+ }
+
+ return registerMemoryEvent(dir, "memory.oom_control", "")
+}
+
+func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
+ dir := paths[oomCgroupName]
+ if dir == "" {
+ return nil, fmt.Errorf("path %q missing", oomCgroupName)
+ }
+
+ if level > CriticalPressure {
+ return nil, fmt.Errorf("invalid pressure level %d", level)
+ }
+
+ levelStr := []string{"low", "medium", "critical"}[level]
+ return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "encoding/binary"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+)
+
// notifyFunc is the shape shared by notifyOnOOM and the
// notifyMemoryPressure wrappers under test.
type notifyFunc func(paths map[string]string) (<-chan struct{}, error)

// testMemoryNotification exercises a notifyFunc against a fake memory
// cgroup directory: it registers the notification, verifies the data
// written to cgroup.event_control, triggers an event via the eventfd,
// then simulates cgroup destruction and checks that the channel is
// closed and both file descriptors are released.
func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ string) {
	memoryPath, err := ioutil.TempDir("", "testmemnotification-"+evName)
	if err != nil {
		t.Fatal(err)
	}
	evFile := filepath.Join(memoryPath, evName)
	eventPath := filepath.Join(memoryPath, "cgroup.event_control")
	if err := ioutil.WriteFile(evFile, []byte{}, 0700); err != nil {
		t.Fatal(err)
	}
	if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil {
		t.Fatal(err)
	}
	paths := map[string]string{
		"memory": memoryPath,
	}
	ch, err := notify(paths)
	if err != nil {
		t.Fatal("expected no error, got:", err)
	}

	data, err := ioutil.ReadFile(eventPath)
	if err != nil {
		t.Fatal("couldn't read event control file:", err)
	}

	// The control file must contain "<event_fd> <target_fd>" plus the
	// optional argument (e.g. pressure level) when one was requested.
	var eventFd, evFd int
	var arg string
	if targ != "" {
		_, err = fmt.Sscanf(string(data), "%d %d %s", &eventFd, &evFd, &arg)
	} else {
		_, err = fmt.Sscanf(string(data), "%d %d", &eventFd, &evFd)
	}
	if err != nil || arg != targ {
		t.Fatalf("invalid control data %q: %s", data, err)
	}

	// dup the eventfd so we can write to it even after the notifier
	// goroutine closes its copy.
	efd, err := unix.Dup(eventFd)
	if err != nil {
		t.Fatal("unable to dup eventfd:", err)
	}
	defer unix.Close(efd)

	buf := make([]byte, 8)
	binary.LittleEndian.PutUint64(buf, 1)

	if _, err := unix.Write(efd, buf); err != nil {
		t.Fatal("unable to write to eventfd:", err)
	}

	select {
	case <-ch:
	case <-time.After(100 * time.Millisecond):
		t.Fatal("no notification on channel after 100ms")
	}

	// simulate what happens when a cgroup is destroyed by cleaning up and then
	// writing to the eventfd.
	if err := os.RemoveAll(memoryPath); err != nil {
		t.Fatal(err)
	}
	if _, err := unix.Write(efd, buf); err != nil {
		t.Fatal("unable to write to eventfd:", err)
	}

	// give things a moment to shut down
	select {
	case _, ok := <-ch:
		if ok {
			t.Fatal("expected no notification to be triggered")
		}
	case <-time.After(100 * time.Millisecond):
		t.Fatal("channel not closed after 100ms")
	}

	// Both original descriptors must now be closed (EBADF from fcntl).
	if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(evFd), unix.F_GETFD, 0); err != unix.EBADF {
		t.Errorf("expected event control to be closed, but received error %s", err.Error())
	}

	if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(eventFd), unix.F_GETFD, 0); err != unix.EBADF {
		t.Errorf("expected event fd to be closed, but received error %s", err.Error())
	}
}
+
+func TestNotifyOnOOM(t *testing.T) {
+ f := func(paths map[string]string) (<-chan struct{}, error) {
+ return notifyOnOOM(paths)
+ }
+
+ testMemoryNotification(t, "memory.oom_control", f, "")
+}
+
+func TestNotifyMemoryPressure(t *testing.T) {
+ tests := map[PressureLevel]string{
+ LowPressure: "low",
+ MediumPressure: "medium",
+ CriticalPressure: "critical",
+ }
+
+ for level, arg := range tests {
+ f := func(paths map[string]string) (<-chan struct{}, error) {
+ return notifyMemoryPressure(paths, level)
+ }
+
+ testMemoryNotification(t, "memory.pressure_level", f, arg)
+ }
+}
--- /dev/null
+## nsenter
+
+The `nsenter` package registers a special init constructor that is called before
+the Go runtime has a chance to boot. This provides us the ability to `setns` on
+existing namespaces and avoid the issues that the Go runtime has with multiple
threads. This constructor is called whenever this package is
imported into your Go application.
+
+The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/)
+package. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
+called the preamble, is used as a header when compiling the C parts of the package.
+So every time we import package `nsenter`, the C code function `nsexec()` would be
+called. And package `nsenter` is only imported in `init.go`, so every time the runc
+`init` command is invoked, that C code is run.
+
+Because `nsexec()` must be run before the Go runtime in order to use the
+Linux kernel namespace, you must `import` this library into a package if
+you plan to use `libcontainer` directly. Otherwise Go will not execute
+the `nsexec()` constructor, which means that the re-exec will not cause
+the namespaces to be joined. You can import it like this:
+
+```go
+import _ "github.com/opencontainers/runc/libcontainer/nsenter"
+```
+
+`nsexec()` will first get the file descriptor number for the init pipe
+from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened
+by the parent and kept open across the fork-exec of the `nsexec()` init
+process). The init pipe is used to read bootstrap data (namespace paths,
+clone flags, uid and gid mappings, and the console path) from the parent
+process. `nsexec()` will then call `setns(2)` to join the namespaces
+provided in the bootstrap data (if available), `clone(2)` a child process
+with the provided clone flags, update the user and group ID mappings, do
+some further miscellaneous setup steps, and then send the PID of the
+child process to the parent of the `nsexec()` "caller". Finally,
+the parent `nsexec()` will exit and the child `nsexec()` process will
return to allow the Go runtime to take over.
+
+NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
+`CLONE_NEW*` clone flags because we must fork a new process in order to
+enter the PID namespace.
+
+
+
--- /dev/null
#ifndef NSENTER_NAMESPACE_H
#define NSENTER_NAMESPACE_H

#ifndef _GNU_SOURCE
#	define _GNU_SOURCE
#endif
#include <sched.h>

/*
 * All of these are taken from include/uapi/linux/sched.h; the #ifndef
 * guards provide fallback definitions when the libc headers are too
 * old to define them.
 */
#ifndef CLONE_NEWNS
#	define CLONE_NEWNS 0x00020000	/* New mount namespace group */
#endif
#ifndef CLONE_NEWCGROUP
#	define CLONE_NEWCGROUP 0x02000000	/* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
#	define CLONE_NEWUTS 0x04000000	/* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
#	define CLONE_NEWIPC 0x08000000	/* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
#	define CLONE_NEWUSER 0x10000000	/* New user namespace */
#endif
#ifndef CLONE_NEWPID
#	define CLONE_NEWPID 0x20000000	/* New pid namespace */
#endif
#ifndef CLONE_NEWNET
#	define CLONE_NEWNET 0x40000000	/* New network namespace */
#endif

#endif /* NSENTER_NAMESPACE_H */
--- /dev/null
// +build linux,!gccgo

package nsenter

// The cgo preamble below registers nsexec() as a C constructor, so it
// runs before the Go runtime starts whenever this package is imported.

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"
--- /dev/null
// +build linux,gccgo

package nsenter

// The cgo preamble below registers nsexec() as a C constructor, so it
// runs before the Go runtime starts whenever this package is imported.

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"

// AlwaysFalse is here to stay false
// (and be exported so the compiler doesn't optimize out its reference)
var AlwaysFalse bool

func init() {
	if AlwaysFalse {
		// by referencing this C init() in a noop test, it will ensure the compiler
		// links in the C function.
		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
		C.init()
	}
}
--- /dev/null
+package nsenter
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/vishvananda/netlink/nl"
+
+ "golang.org/x/sys/unix"
+)
+
// pid is the JSON reply written by the nsexec bootstrap, carrying the
// grandchild's process ID.
type pid struct {
	Pid int `json:"Pid"`
}
+
// TestNsenterValidPaths re-executes the test binary (see init below)
// with a valid namespace path list delivered over a socketpair as a
// netlink-encoded bootstrap message, and expects the child to report
// the grandchild PID as JSON.
func TestNsenterValidPaths(t *testing.T) {
	args := []string{"nsenter-exec"}
	parent, child, err := newPipe()
	if err != nil {
		t.Fatalf("failed to create pipe %v", err)
	}

	namespaces := []string{
		// join pid ns of the current process
		fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
	}
	cmd := &exec.Cmd{
		Path: os.Args[0],
		Args: args,
		// The child end becomes fd 3 in the new process, matching
		// _LIBCONTAINER_INITPIPE below.
		ExtraFiles: []*os.File{child},
		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
		Stdout:     os.Stdout,
		Stderr:     os.Stderr,
	}

	if err := cmd.Start(); err != nil {
		t.Fatalf("nsenter failed to start %v", err)
	}
	// write cloneFlags
	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
	r.AddData(&libcontainer.Int32msg{
		Type:  libcontainer.CloneFlagsAttr,
		Value: uint32(unix.CLONE_NEWNET),
	})
	r.AddData(&libcontainer.Bytemsg{
		Type:  libcontainer.NsPathsAttr,
		Value: []byte(strings.Join(namespaces, ",")),
	})
	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
		t.Fatal(err)
	}

	decoder := json.NewDecoder(parent)
	var pid *pid

	// NOTE(review): Wait is called before Decode, which relies on the
	// child's small JSON reply fitting in the socket buffer — confirm.
	if err := cmd.Wait(); err != nil {
		t.Fatalf("nsenter exits with a non-zero exit status")
	}
	if err := decoder.Decode(&pid); err != nil {
		// On failure, log the available namespace files to aid debugging.
		dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid()))
		for _, d := range dir {
			t.Log(d.Name())
		}
		t.Fatalf("%v", err)
	}

	// Reap the reported grandchild.
	p, err := os.FindProcess(pid.Pid)
	if err != nil {
		t.Fatalf("%v", err)
	}
	p.Wait()
}
+
+func TestNsenterInvalidPaths(t *testing.T) {
+ args := []string{"nsenter-exec"}
+ parent, child, err := newPipe()
+ if err != nil {
+ t.Fatalf("failed to create pipe %v", err)
+ }
+
+ namespaces := []string{
+ // join pid ns of the current process
+ fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
+ }
+ cmd := &exec.Cmd{
+ Path: os.Args[0],
+ Args: args,
+ ExtraFiles: []*os.File{child},
+ Env: []string{"_LIBCONTAINER_INITPIPE=3"},
+ }
+
+ if err := cmd.Start(); err != nil {
+ t.Fatal(err)
+ }
+ // write cloneFlags
+ r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+ r.AddData(&libcontainer.Int32msg{
+ Type: libcontainer.CloneFlagsAttr,
+ Value: uint32(unix.CLONE_NEWNET),
+ })
+ r.AddData(&libcontainer.Bytemsg{
+ Type: libcontainer.NsPathsAttr,
+ Value: []byte(strings.Join(namespaces, ",")),
+ })
+ if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := cmd.Wait(); err == nil {
+ t.Fatalf("nsenter exits with a zero exit status")
+ }
+}
+
+func TestNsenterIncorrectPathType(t *testing.T) {
+ args := []string{"nsenter-exec"}
+ parent, child, err := newPipe()
+ if err != nil {
+ t.Fatalf("failed to create pipe %v", err)
+ }
+
+ namespaces := []string{
+ // join pid ns of the current process
+ fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
+ }
+ cmd := &exec.Cmd{
+ Path: os.Args[0],
+ Args: args,
+ ExtraFiles: []*os.File{child},
+ Env: []string{"_LIBCONTAINER_INITPIPE=3"},
+ }
+
+ if err := cmd.Start(); err != nil {
+ t.Fatal(err)
+ }
+ // write cloneFlags
+ r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+ r.AddData(&libcontainer.Int32msg{
+ Type: libcontainer.CloneFlagsAttr,
+ Value: uint32(unix.CLONE_NEWNET),
+ })
+ r.AddData(&libcontainer.Bytemsg{
+ Type: libcontainer.NsPathsAttr,
+ Value: []byte(strings.Join(namespaces, ",")),
+ })
+ if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := cmd.Wait(); err == nil {
+ t.Fatalf("nsenter exits with a zero exit status")
+ }
+}
+
func init() {
	// When re-executed as the nsenter child ("nsenter-exec"), exit
	// immediately: only the C constructor's work matters there. The
	// redundant trailing return of the original has been dropped.
	if strings.HasPrefix(os.Args[0], "nsenter-") {
		os.Exit(0)
	}
}
+
+func newPipe() (parent *os.File, child *os.File, err error) {
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+ if err != nil {
+ return nil, nil, err
+ }
+ return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
+}
--- /dev/null
// +build !linux !cgo

// Stub for platforms/builds where nsenter is unsupported (non-Linux or
// cgo disabled); no constructor is registered.
package nsenter

import "C"
--- /dev/null
+
+#define _GNU_SOURCE
+#include <endian.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <linux/limits.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+
+/* Get all of the CLONE_NEW* flags. */
+#include "namespace.h"
+
/* Synchronisation values. */
enum sync_t {
	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
	SYNC_USERMAP_ACK = 0x41,	/* Mapping finished by the parent. */
	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */

	/* XXX: This doesn't help with segfaults and other such issues. */
	SYNC_ERR = 0xFF,	/* Fatal error, no turning back. The error code follows. */
};

/*
 * Synchronisation value for cgroup namespace setup.
 * The same constant is defined in process_linux.go as "createCgroupns".
 */
#define CREATECGROUPNS 0x80

/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1

/* JSON buffer. */
#define JSON_MAX 4096

/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	/* Zero-length member: its address marks the end of the stack area. */
	char stack_ptr[0];

	/* There's two children. This is used to execute the different code. */
	jmp_buf *env;
	int jmpval;
};

/*
 * Bootstrap configuration decoded from the netlink message sent by the
 * Go side over the init pipe; pointers reference into `data`.
 */
struct nlconfig_t {
	char *data;

	/* Process settings. */
	uint32_t cloneflags;
	char *oom_score_adj;
	size_t oom_score_adj_len;

	/* User namespace settings. */
	char *uidmap;
	size_t uidmap_len;
	char *gidmap;
	size_t gidmap_len;
	char *namespaces;
	size_t namespaces_len;
	uint8_t is_setgroup;

	/* Rootless container settings. */
	uint8_t is_rootless_euid;	/* boolean */
	char *uidmappath;
	size_t uidmappath_len;
	char *gidmappath;
	size_t gidmappath_len;
};
+
+/*
+ * List of netlink message types sent to us as part of bootstrapping the init.
+ * These constants are defined in libcontainer/message_linux.go and must be
+ * kept in sync with it. Each *_ATTR fills the matching nlconfig_t field.
+ */
+#define INIT_MSG 62000
+#define CLONE_FLAGS_ATTR 27281
+#define NS_PATHS_ATTR 27282
+#define UIDMAP_ATTR 27283
+#define GIDMAP_ATTR 27284
+#define SETGROUP_ATTR 27285
+#define OOM_SCORE_ADJ_ATTR 27286
+#define ROOTLESS_EUID_ATTR 27287
+#define UIDMAPPATH_ATTR 27288
+#define GIDMAPPATH_ATTR 27289
+
+/*
+ * Use the raw syscall for versions of glibc which don't include a function for
+ * it, namely (glibc 2.12).
+ *
+ * NOTE(review): defining _GNU_SOURCE this late only affects headers included
+ * after this point ("syscall.h" below) — confirm that is the intent.
+ */
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
+# define _GNU_SOURCE
+# include "syscall.h"
+# if !defined(SYS_setns) && defined(__NR_setns)
+# define SYS_setns __NR_setns
+# endif
+
+#ifndef SYS_setns
+# error "setns(2) syscall not supported by glibc version"
+#endif
+
+/* Minimal setns(2) wrapper: identical calling convention to the glibc one. */
+int setns(int fd, int nstype)
+{
+ return syscall(SYS_setns, fd, nstype);
+}
+#endif
+
+/*
+ * Sync pipe to the peer process; -1 until set inside nsexec(). bail() uses
+ * it to forward fatal errors across the stage boundary.
+ * XXX: This is ugly.
+ */
+static int syncfd = -1;
+
+/*
+ * Print an error (with errno via %m), forward SYNC_ERR plus a per-call-site
+ * error code over syncfd (if set), and exit with that code. __COUNTER__ + 1
+ * gives every bail() site a distinct non-zero exit code.
+ */
+/* TODO(cyphar): Fix this so it correctly deals with syncT. */
+#define bail(fmt, ...) \
+ do { \
+ int ret = __COUNTER__ + 1; \
+ fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
+ if (syncfd >= 0) { \
+ enum sync_t s = SYNC_ERR; \
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
+ fprintf(stderr, "nsenter: failed: write(s)"); \
+ if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
+ fprintf(stderr, "nsenter: failed: write(ret)"); \
+ } \
+ exit(ret); \
+ } while(0)
+
+/*
+ * Formats a path from pathfmt/... and writes the whole buffer to it.
+ * The file must already exist (O_RDWR, no O_CREAT) — all callers target
+ * /proc files. Returns 0 on success, -1 on error with errno set by the
+ * failing call.
+ */
+static int write_file(char *data, size_t data_len, char *pathfmt, ...)
+{
+ int fd, len, ret = 0;
+ char path[PATH_MAX];
+
+ va_list ap;
+ va_start(ap, pathfmt);
+ len = vsnprintf(path, PATH_MAX, pathfmt, ap);
+ va_end(ap);
+ /*
+ * Treat truncation (len >= PATH_MAX) as an error as well: opening a
+ * silently-truncated path would write to the wrong file.
+ */
+ if (len < 0 || len >= PATH_MAX)
+ return -1;
+
+ fd = open(path, O_RDWR);
+ if (fd < 0)
+ return -1;
+
+ /* A short write counts as failure: these /proc writes are all-or-nothing. */
+ if (write(fd, data, data_len) != (ssize_t)data_len)
+ ret = -1;
+
+ close(fd);
+ return ret;
+}
+
+/* Policy written to /proc/<pid>/setgroups by update_setgroups(). */
+enum policy_t {
+ SETGROUPS_DEFAULT = 0, /* Leave the kernel default untouched. */
+ SETGROUPS_ALLOW, /* Write "allow". */
+ SETGROUPS_DENY, /* Write "deny". */
+};
+
+/*
+ * Writes the requested setgroups policy to /proc/<pid>/setgroups.
+ * This *must* be called before we touch gid_map (required since Linux 3.19).
+ * SETGROUPS_DEFAULT is a no-op; a missing /proc file (old kernel) is ignored.
+ */
+static void update_setgroups(int pid, enum policy_t setgroup)
+{
+	char *policy;
+
+	if (setgroup == SETGROUPS_ALLOW)
+		policy = "allow";
+	else if (setgroup == SETGROUPS_DENY)
+		policy = "deny";
+	else
+		return;	/* SETGROUPS_DEFAULT (or unknown): nothing to do. */
+
+	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
+		/*
+		 * If the kernel is too old to support /proc/pid/setgroups,
+		 * open(2) or write(2) will return ENOENT. This is fine.
+		 */
+		if (errno != ENOENT)
+			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
+	}
+}
+
+/*
+ * Forks and execs the external mapping helper (newuidmap/newgidmap) @app
+ * with "<pid> <map tokens...>" as arguments, where @map is split on spaces
+ * and newlines. Returns the helper's exit status (0 on success) or another
+ * non-zero value on any failure.
+ */
+static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
+{
+	int child;
+
+	/*
+	 * If @app is NULL, execve will segfault. Just check it here and bail (if
+	 * we're in this path, the caller is already getting desperate and there
+	 * isn't a backup to this failing). This usually would be a configuration
+	 * or programming issue.
+	 */
+	if (!app)
+		bail("mapping tool not present");
+
+	child = fork();
+	if (child < 0)
+		bail("failed to fork");
+
+	if (!child) {
+#define MAX_ARGV 20
+		char *argv[MAX_ARGV];
+		char *envp[] = { NULL };
+		char pid_fmt[16];
+		int argc = 0;
+		char *next;
+
+		snprintf(pid_fmt, 16, "%d", pid);
+
+		argv[argc++] = (char *)app;
+		argv[argc++] = pid_fmt;
+		/*
+		 * Convert the map string into a list of argument that
+		 * newuidmap/newgidmap can understand. Always leave room for, and
+		 * unconditionally append, the NULL terminator execve(2) requires
+		 * (the old code could hand execve an unterminated argv when the
+		 * map lacked a trailing separator or MAX_ARGV was reached).
+		 */
+		while (argc < MAX_ARGV - 1 && *map != '\0') {
+			argv[argc++] = map;
+			next = strpbrk(map, "\n ");
+			if (next == NULL)
+				break;
+			*next++ = '\0';
+			map = next + strspn(next, "\n ");
+		}
+		argv[argc] = NULL;
+
+		execve(app, argv, envp);
+		bail("failed to execv");
+	} else {
+		int status;
+
+		while (true) {
+			if (waitpid(child, &status, 0) < 0) {
+				if (errno == EINTR)
+					continue;
+				bail("failed to waitpid");
+			}
+			if (WIFEXITED(status))
+				return WEXITSTATUS(status);
+			/*
+			 * WEXITSTATUS() is meaningless for a signal-killed child (and
+			 * evaluates to 0, masking the failure). Report a distinct
+			 * non-zero code, shell-style.
+			 */
+			if (WIFSIGNALED(status))
+				return 128 + WTERMSIG(status);
+		}
+	}
+
+	/* Should never be reached. */
+	return -1;
+}
+
+/*
+ * Writes @map to /proc/<pid>/uid_map, falling back to the external
+ * newuidmap helper at @path when the direct write is denied (EPERM,
+ * i.e. an unprivileged/rootless caller). Empty maps are a no-op.
+ */
+static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
+{
+	if (!map || !map_len)
+		return;
+
+	if (write_file(map, map_len, "/proc/%d/uid_map", pid) >= 0)
+		return;
+	if (errno != EPERM)
+		bail("failed to update /proc/%d/uid_map", pid);
+	if (try_mapping_tool(path, pid, map, map_len))
+		bail("failed to use newuid map on %d", pid);
+}
+
+/*
+ * Writes @map to /proc/<pid>/gid_map, falling back to the external
+ * newgidmap helper at @path when the direct write is denied (EPERM,
+ * i.e. an unprivileged/rootless caller). Empty maps are a no-op.
+ */
+static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
+{
+	if (!map || !map_len)
+		return;
+
+	if (write_file(map, map_len, "/proc/%d/gid_map", pid) >= 0)
+		return;
+	if (errno != EPERM)
+		bail("failed to update /proc/%d/gid_map", pid);
+	if (try_mapping_tool(path, pid, map, map_len))
+		bail("failed to use newgid map on %d", pid);
+}
+
+/*
+ * Writes @data to /proc/self/oom_score_adj; children inherit the value
+ * across fork(2). An empty value is a no-op.
+ */
+static void update_oom_score_adj(char *data, size_t len)
+{
+	if (!data || !len)
+		return;
+	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
+		bail("failed to update /proc/self/oom_score_adj");
+}
+
+/*
+ * A dummy function that just jumps to the given jumpval: the clone(2) entry
+ * point which immediately longjmp()s back into nsexec()'s setjmp(env) with
+ * the requested JUMP_* value. Never returns normally.
+ */
+static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg)
+{
+ struct clone_t *ca = (struct clone_t *)arg;
+ longjmp(*ca->env, ca->jmpval);
+}
+
+/*
+ * Spawns a new process running child_func() on the on-stack clone_t stack.
+ * CLONE_PARENT makes the new process a sibling of the caller, so the original
+ * bootstrap process (not us) is responsible for reaping it — see the rant in
+ * nsexec(). Returns the new pid, or -1 on failure.
+ */
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval)
+{
+ struct clone_t ca = {
+ .env = env,
+ .jmpval = jmpval,
+ };
+
+ return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+}
+
+/*
+ * Gets the init pipe fd from the _LIBCONTAINER_INITPIPE environment
+ * variable, which is used to read the bootstrap data and tell the parent
+ * what the new pid is after we finish setting up the environment.
+ * Returns -1 when the variable is absent or empty; bails on garbage.
+ */
+static int initpipe(void)
+{
+	char *value, *end;
+	int fd;
+
+	value = getenv("_LIBCONTAINER_INITPIPE");
+	if (!value || !*value)
+		return -1;
+
+	fd = strtol(value, &end, 10);
+	if (*end != '\0')
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
+	return fd;
+}
+
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
+static int nsflag(char *name)
+{
+	static const struct {
+		const char *name;
+		int flag;
+	} table[] = {
+		{"cgroup", CLONE_NEWCGROUP},
+		{"ipc", CLONE_NEWIPC},
+		{"mnt", CLONE_NEWNS},
+		{"net", CLONE_NEWNET},
+		{"pid", CLONE_NEWPID},
+		{"user", CLONE_NEWUSER},
+		{"uts", CLONE_NEWUTS},
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
+		if (!strcmp(name, table[i].name))
+			return table[i].flag;
+	}
+
+	/* If we don't recognise a name, fallback to 0. */
+	return 0;
+}
+
+/*
+ * Reads a native-endian uint32 from the netlink payload buffer. memcpy
+ * avoids the type-punned (and potentially unaligned) load the old cast
+ * performed, which is undefined behaviour on strict-alignment targets.
+ */
+static uint32_t readint32(char *buf)
+{
+	uint32_t val;
+
+	memcpy(&val, buf, sizeof(val));
+	return val;
+}
+
+/* Reads a single byte from the netlink payload buffer as an unsigned value. */
+static uint8_t readint8(char *buf)
+{
+	return (uint8_t) buf[0];
+}
+
+/*
+ * Reads the bootstrap configuration from @fd: one netlink message of type
+ * INIT_MSG whose payload is a flat sequence of nlattrs, one per *_ATTR
+ * constant above. Pointer-typed results are left pointing into the single
+ * malloc'd payload buffer (config->data); release with nl_free().
+ * Any malformed input bail()s. NOTE(review): attribute lengths are not
+ * bounds-checked against the buffer — presumably the sender (the runc
+ * parent) is fully trusted; confirm.
+ */
+static void nl_parse(int fd, struct nlconfig_t *config)
+{
+ size_t len, size;
+ struct nlmsghdr hdr;
+ char *data, *current;
+
+ /* Retrieve the netlink header. */
+ len = read(fd, &hdr, NLMSG_HDRLEN);
+ if (len != NLMSG_HDRLEN)
+ bail("invalid netlink header length %zu", len);
+
+ if (hdr.nlmsg_type == NLMSG_ERROR)
+ bail("failed to read netlink message");
+
+ if (hdr.nlmsg_type != INIT_MSG)
+ bail("unexpected msg type %d", hdr.nlmsg_type);
+
+ /* Retrieve data. */
+ size = NLMSG_PAYLOAD(&hdr, 0);
+ current = data = malloc(size);
+ if (!data)
+ bail("failed to allocate %zu bytes of memory for nl_payload", size);
+
+ len = read(fd, data, size);
+ if (len != size)
+ bail("failed to read netlink payload, %zu != %zu", len, size);
+
+ /* Parse the netlink payload. */
+ config->data = data;
+ while (current < data + size) {
+ struct nlattr *nlattr = (struct nlattr *)current;
+ size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
+
+ /* Advance to payload. */
+ current += NLA_HDRLEN;
+
+ /* Handle payload. */
+ switch (nlattr->nla_type) {
+ case CLONE_FLAGS_ATTR:
+ config->cloneflags = readint32(current);
+ break;
+ case ROOTLESS_EUID_ATTR:
+ config->is_rootless_euid = readint8(current); /* boolean */
+ break;
+ case OOM_SCORE_ADJ_ATTR:
+ config->oom_score_adj = current;
+ config->oom_score_adj_len = payload_len;
+ break;
+ case NS_PATHS_ATTR:
+ config->namespaces = current;
+ config->namespaces_len = payload_len;
+ break;
+ case UIDMAP_ATTR:
+ config->uidmap = current;
+ config->uidmap_len = payload_len;
+ break;
+ case GIDMAP_ATTR:
+ config->gidmap = current;
+ config->gidmap_len = payload_len;
+ break;
+ case UIDMAPPATH_ATTR:
+ config->uidmappath = current;
+ config->uidmappath_len = payload_len;
+ break;
+ case GIDMAPPATH_ATTR:
+ config->gidmappath = current;
+ config->gidmappath_len = payload_len;
+ break;
+ case SETGROUP_ATTR:
+ config->is_setgroup = readint8(current);
+ break;
+ default:
+ bail("unknown netlink message type %d", nlattr->nla_type);
+ }
+
+ /* Skip to the next attribute, including its alignment padding. */
+ current += NLA_ALIGN(payload_len);
+ }
+}
+
+/* Releases the payload buffer allocated by nl_parse() (and with it every
+ * pointer member of @config, which all alias that buffer). */
+void nl_free(struct nlconfig_t *config)
+{
+ free(config->data);
+}
+
+/*
+ * Joins every namespace in @nslist, a comma-separated list of "type:path"
+ * entries (e.g. "net:/proc/1/ns/net"). The list is tokenised in place.
+ * All fds are opened before any setns(2) call, and the joins then happen
+ * in list order — see the comments below for why both points matter.
+ */
+void join_namespaces(char *nslist)
+{
+ int num = 0, i;
+ char *saveptr = NULL;
+ char *namespace = strtok_r(nslist, ",", &saveptr);
+ struct namespace_t {
+ int fd; /* Opened namespace file, closed after setns(). */
+ int ns; /* CLONE_NEW* flag from nsflag() (0 if unrecognised). */
+ char type[PATH_MAX];
+ char path[PATH_MAX];
+ } *namespaces = NULL;
+
+ if (!namespace || !strlen(namespace) || !strlen(nslist))
+ bail("ns paths are empty");
+
+ /*
+ * We have to open the file descriptors first, since after
+ * we join the mnt namespace we might no longer be able to
+ * access the paths.
+ */
+ do {
+ int fd;
+ char *path;
+ struct namespace_t *ns;
+
+ /* Resize the namespace array. */
+ namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+ if (!namespaces)
+ bail("failed to reallocate namespace array");
+ ns = &namespaces[num - 1];
+
+ /* Split 'ns:path'. */
+ path = strstr(namespace, ":");
+ if (!path)
+ bail("failed to parse %s", namespace);
+ *path++ = '\0';
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ bail("failed to open %s", path);
+
+ ns->fd = fd;
+ ns->ns = nsflag(namespace);
+ strncpy(ns->path, path, PATH_MAX - 1);
+ ns->path[PATH_MAX - 1] = '\0';
+ } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+
+ /*
+ * The ordering in which we join namespaces is important. We should
+ * always join the user namespace *first*. This is all guaranteed
+ * from the container_linux.go side of this, so we're just going to
+ * follow the order given to us.
+ */
+
+ for (i = 0; i < num; i++) {
+ struct namespace_t ns = namespaces[i];
+
+ if (setns(ns.fd, ns.ns) < 0)
+ bail("failed to setns to %s", ns.path);
+
+ close(ns.fd);
+ }
+
+ free(namespaces);
+}
+
+/*
+ * The namespace bootstrap entry point, run from a C constructor before the
+ * Go runtime starts. Stage 0 (JUMP_PARENT) writes uid/gid maps and relays
+ * pids; stage 1 (JUMP_CHILD) joins/unshares namespaces; stage 2 (JUMP_INIT)
+ * is the only stage that returns, handing control back to the Go runtime.
+ */
+void nsexec(void)
+{
+ int pipenum;
+ jmp_buf env;
+ int sync_child_pipe[2], sync_grandchild_pipe[2];
+ struct nlconfig_t config = { 0 };
+
+ /*
+ * If we don't have an init pipe, just return to the go routine.
+ * We'll only get an init pipe for start or exec.
+ */
+ pipenum = initpipe();
+ if (pipenum == -1)
+ return;
+
+ /* Parse all of the netlink configuration. */
+ nl_parse(pipenum, &config);
+
+ /* Set oom_score_adj. This has to be done before !dumpable because
+ * /proc/self/oom_score_adj is not writeable unless you're an privileged
+ * user (if !dumpable is set). All children inherit their parent's
+ * oom_score_adj value on fork(2) so this will always be propagated
+ * properly.
+ */
+ update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
+
+ /*
+ * Make the process non-dumpable, to avoid various race conditions that
+ * could cause processes in namespaces we're joining to access host
+ * resources (or potentially execute code).
+ *
+ * However, if the number of namespaces we are joining is 0, we are not
+ * going to be switching to a different security context. Thus setting
+ * ourselves to be non-dumpable only breaks things (like rootless
+ * containers), which is the recommendation from the kernel folks.
+ */
+ if (config.namespaces) {
+ if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+ bail("failed to set process as non-dumpable");
+ }
+
+ /* Pipe so we can tell the child when we've finished setting up. */
+ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
+ bail("failed to setup sync pipe between parent and child");
+
+ /*
+ * We need a new socketpair to sync with grandchild so we don't have
+ * race condition with child.
+ */
+ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
+ bail("failed to setup sync pipe between parent and grandchild");
+
+ /* TODO: Currently we aren't dealing with child deaths properly. */
+
+ /*
+ * Okay, so this is quite annoying.
+ *
+ * In order for this unsharing code to be more extensible we need to split
+ * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
+ * would be if we did clone(CLONE_NEWUSER) and the other namespaces
+ * separately, but because of SELinux issues we cannot really do that. But
+ * we cannot just dump the namespace flags into clone(...) because several
+ * usecases (such as rootless containers) require more granularity around
+ * the namespace setup. In addition, some older kernels had issues where
+ * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
+ * handle this while also dealing with SELinux so we choose SELinux support
+ * over broken kernel support).
+ *
+ * However, if we unshare(2) the user namespace *before* we clone(2), then
+ * all hell breaks loose.
+ *
+ * The parent no longer has permissions to do many things (unshare(2) drops
+ * all capabilities in your old namespace), and the container cannot be set
+ * up to have more than one {uid,gid} mapping. This is obviously less than
+ * ideal. In order to fix this, we have to first clone(2) and then unshare.
+ *
+ * Unfortunately, it's not as simple as that. We have to fork to enter the
+ * PID namespace (the PID namespace only applies to children). Since we'll
+ * have to double-fork, this clone_parent() call won't be able to get the
+ * PID of the _actual_ init process (without doing more synchronisation than
+ * I can deal with at the moment). So we'll just get the parent to send it
+ * for us, the only job of this process is to update
+ * /proc/pid/{setgroups,uid_map,gid_map}.
+ *
+ * And as a result of the above, we also need to setns(2) in the first child
+ * because if we join a PID namespace in the topmost parent then our child
+ * will be in that namespace (and it will not be able to give us a PID value
+ * that makes sense without resorting to sending things with cmsg).
+ *
+ * This also deals with an older issue caused by dumping cloneflags into
+ * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
+ * we have to unshare(2) before clone(2) in order to do this. This was fixed
+ * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
+ * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
+ * aware, the last mainline kernel which had this bug was Linux 3.12.
+ * However, we cannot comment on which kernels the broken patch was
+ * backported to.
+ *
+ * -- Aleksa "what has my life come to?" Sarai
+ */
+
+ switch (setjmp(env)) {
+ /*
+ * Stage 0: We're in the parent. Our job is just to create a new child
+ * (stage 1: JUMP_CHILD) process and write its uid_map and
+ * gid_map. That process will go on to create a new process, then
+ * it will send us its PID which we will send to the bootstrap
+ * process.
+ */
+ case JUMP_PARENT:{
+ int len;
+ pid_t child, first_child = -1;
+ bool ready = false;
+
+ /* For debugging. */
+ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+
+ /* Start the process of getting a container. */
+ child = clone_parent(&env, JUMP_CHILD);
+ if (child < 0)
+ bail("unable to fork: child_func");
+
+ /*
+ * State machine for synchronisation with the children.
+ *
+ * Father only return when both child and grandchild are
+ * ready, so we can receive all possible error codes
+ * generated by children.
+ */
+ while (!ready) {
+ enum sync_t s;
+ int ret;
+
+ syncfd = sync_child_pipe[1];
+ close(sync_child_pipe[0]);
+
+ if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with child: next state");
+
+ switch (s) {
+ case SYNC_ERR:
+ /* We have to mirror the error code of the child. */
+ if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+ bail("failed to sync with child: read(error code)");
+
+ exit(ret);
+ case SYNC_USERMAP_PLS:
+ /*
+ * Enable setgroups(2) if we've been asked to. But we also
+ * have to explicitly disable setgroups(2) if we're
+ * creating a rootless container for single-entry mapping.
+ * i.e. config.is_setgroup == false.
+ * (this is required since Linux 3.19).
+ *
+ * For rootless multi-entry mapping, config.is_setgroup shall be true and
+ * newuidmap/newgidmap shall be used.
+ */
+
+ if (config.is_rootless_euid && !config.is_setgroup)
+ update_setgroups(child, SETGROUPS_DENY);
+
+ /* Set up mappings. */
+ update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
+ update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+
+ s = SYNC_USERMAP_ACK;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+ }
+ break;
+ case SYNC_RECVPID_PLS:{
+ first_child = child;
+
+ /* Get the init_func pid. */
+ if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
+ kill(first_child, SIGKILL);
+ bail("failed to sync with child: read(childpid)");
+ }
+
+ /* Send ACK. */
+ s = SYNC_RECVPID_ACK;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(first_child, SIGKILL);
+ kill(child, SIGKILL);
+ bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
+ }
+
+ /* Send the init_func pid back to our parent.
+ *
+ * Send the init_func pid and the pid of the first child back to our parent.
+ * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
+ * It becomes the responsibility of our parent to reap the first child.
+ */
+ len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
+ if (len < 0) {
+ kill(child, SIGKILL);
+ bail("unable to generate JSON for child pid");
+ }
+ }
+ break;
+ case SYNC_CHILD_READY:
+ ready = true;
+ break;
+ default:
+ bail("unexpected sync value: %u", s);
+ }
+ }
+
+ /* Now sync with grandchild. */
+
+ ready = false;
+ while (!ready) {
+ enum sync_t s;
+ int ret;
+
+ syncfd = sync_grandchild_pipe[1];
+ close(sync_grandchild_pipe[0]);
+
+ s = SYNC_GRANDCHILD;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with child: write(SYNC_GRANDCHILD)");
+ }
+
+ if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with child: next state");
+
+ switch (s) {
+ case SYNC_ERR:
+ /* We have to mirror the error code of the child. */
+ if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+ bail("failed to sync with child: read(error code)");
+
+ exit(ret);
+ case SYNC_CHILD_READY:
+ ready = true;
+ break;
+ default:
+ bail("unexpected sync value: %u", s);
+ }
+ }
+ exit(0);
+ }
+
+ /*
+ * Stage 1: We're in the first child process. Our job is to join any
+ * provided namespaces in the netlink payload and unshare all
+ * of the requested namespaces. If we've been asked to
+ * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
+ * our user mappings for us. Then, we create a new child
+ * (stage 2: JUMP_INIT) for PID namespace. We then send the
+ * child's PID to our parent (stage 0).
+ */
+ case JUMP_CHILD:{
+ pid_t child;
+ enum sync_t s;
+
+ /* We're in a child and thus need to tell the parent if we die. */
+ syncfd = sync_child_pipe[0];
+ close(sync_child_pipe[1]);
+
+ /* For debugging. */
+ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+
+ /*
+ * We need to setns first. We cannot do this earlier (in stage 0)
+ * because of the fact that we forked to get here (the PID of
+ * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+ * using cmsg(3) but that's just annoying.
+ */
+ if (config.namespaces)
+ join_namespaces(config.namespaces);
+
+ /*
+ * Deal with user namespaces first. They are quite special, as they
+ * affect our ability to unshare other namespaces and are used as
+ * context for privilege checks.
+ *
+ * We don't unshare all namespaces in one go. The reason for this
+ * is that, while the kernel documentation may claim otherwise,
+ * there are certain cases where unsharing all namespaces at once
+ * will result in namespace objects being owned incorrectly.
+ * Ideally we should just fix these kernel bugs, but it's better to
+ * be safe than sorry, and fix them separately.
+ *
+ * A specific case of this is that the SELinux label of the
+ * internal kern-mount that mqueue uses will be incorrect if the
+ * UTS namespace is cloned before the USER namespace is mapped.
+ * I've also heard of similar problems with the network namespace
+ * in some scenarios. This also mirrors how LXC deals with this
+ * problem.
+ */
+ if (config.cloneflags & CLONE_NEWUSER) {
+ if (unshare(CLONE_NEWUSER) < 0)
+ bail("failed to unshare user namespace");
+ config.cloneflags &= ~CLONE_NEWUSER;
+
+ /*
+ * We don't have the privileges to do any mapping here (see the
+ * clone_parent rant). So signal our parent to hook us up.
+ */
+
+ /* Switching is only necessary if we joined namespaces. */
+ if (config.namespaces) {
+ if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
+ bail("failed to set process as dumpable");
+ }
+ s = SYNC_USERMAP_PLS;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
+
+ /* ... wait for mapping ... */
+
+ if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
+ if (s != SYNC_USERMAP_ACK)
+ bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
+ /* Switching is only necessary if we joined namespaces. */
+ if (config.namespaces) {
+ if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+ bail("failed to set process as non-dumpable");
+ }
+
+ /* Become root in the namespace proper. */
+ if (setresuid(0, 0, 0) < 0)
+ bail("failed to become root in user namespace");
+ }
+ /*
+ * Unshare all of the namespaces. Now, it should be noted that this
+ * ordering might break in the future (especially with rootless
+ * containers). But for now, it's not possible to split this into
+ * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
+ *
+ * Note that we don't merge this with clone() because there were
+ * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
+ * was broken, so we'll just do it the long way anyway.
+ */
+ if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
+ bail("failed to unshare namespaces");
+
+ /*
+ * TODO: What about non-namespace clone flags that we're dropping here?
+ *
+ * We fork again because of PID namespace, setns(2) or unshare(2) don't
+ * change the PID namespace of the calling process, because doing so
+ * would change the caller's idea of its own PID (as reported by getpid()),
+ * which would break many applications and libraries, so we must fork
+ * to actually enter the new PID namespace.
+ */
+ child = clone_parent(&env, JUMP_INIT);
+ if (child < 0)
+ bail("unable to fork: init_func");
+
+ /* Send the child to our parent, which knows what it's doing. */
+ s = SYNC_RECVPID_PLS;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
+ }
+ if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with parent: write(childpid)");
+ }
+
+ /* ... wait for parent to get the pid ... */
+
+ if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
+ }
+ if (s != SYNC_RECVPID_ACK) {
+ kill(child, SIGKILL);
+ bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
+ }
+
+ s = SYNC_CHILD_READY;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+ kill(child, SIGKILL);
+ bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+ }
+
+ /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+ exit(0);
+ }
+
+ /*
+ * Stage 2: We're the final child process, and the only process that will
+ * actually return to the Go runtime. Our job is to just do the
+ * final cleanup steps and then return to the Go runtime to allow
+ * init_linux.go to run.
+ */
+ case JUMP_INIT:{
+ /*
+ * We're inside the child now, having jumped from the
+ * start_child() code after forking in the parent.
+ */
+ enum sync_t s;
+
+ /* We're in a child and thus need to tell the parent if we die. */
+ syncfd = sync_grandchild_pipe[0];
+ close(sync_grandchild_pipe[1]);
+ close(sync_child_pipe[0]);
+ close(sync_child_pipe[1]);
+
+ /* For debugging. */
+ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+
+ if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
+ if (s != SYNC_GRANDCHILD)
+ bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
+
+ if (setsid() < 0)
+ bail("setsid failed");
+
+ if (setuid(0) < 0)
+ bail("setuid failed");
+
+ if (setgid(0) < 0)
+ bail("setgid failed");
+
+ if (!config.is_rootless_euid && config.is_setgroup) {
+ if (setgroups(0, NULL) < 0)
+ bail("setgroups failed");
+ }
+
+ /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+ if (config.cloneflags & CLONE_NEWCGROUP) {
+ uint8_t value;
+ if (read(pipenum, &value, sizeof(value)) != sizeof(value))
+ bail("read synchronisation value failed");
+ if (value == CREATECGROUPNS) {
+ if (unshare(CLONE_NEWCGROUP) < 0)
+ bail("failed to unshare cgroup namespace");
+ } else
+ bail("received unknown synchronisation value");
+ }
+
+ s = SYNC_CHILD_READY;
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+ bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+
+ /* Close sync pipes. */
+ close(sync_grandchild_pipe[0]);
+
+ /* Free netlink data. */
+ nl_free(&config);
+
+ /* Finish executing, let the Go runtime take over. */
+ return;
+ }
+ default:
+ bail("unexpected jump value");
+ }
+
+ /* Should never be reached. */
+ bail("should never be reached");
+}
--- /dev/null
+package libcontainer
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// processOperations is the internal control surface backing the exported
+// Process methods; it is implemented by the platform-specific process
+// wrappers and stored in Process.ops once the process has been started.
+type processOperations interface {
+ wait() (*os.ProcessState, error)
+ signal(sig os.Signal) error
+ pid() int
+}
+
+// Process specifies the configuration and IO for a process inside
+// a container.
+type Process struct {
+ // The command to be run followed by any arguments.
+ Args []string
+
+ // Env specifies the environment variables for the process.
+ Env []string
+
+ // User will set the uid and gid of the executing process running inside the container
+ // local to the container's user and group configuration.
+ User string
+
+ // AdditionalGroups specifies the gids that should be added to supplementary groups
+ // in addition to those that the user belongs to.
+ AdditionalGroups []string
+
+ // Cwd will change the processes current working directory inside the container's rootfs.
+ Cwd string
+
+ // Stdin is a pointer to a reader which provides the standard input stream.
+ Stdin io.Reader
+
+ // Stdout is a pointer to a writer which receives the standard output stream.
+ Stdout io.Writer
+
+ // Stderr is a pointer to a writer which receives the standard error stream.
+ Stderr io.Writer
+
+ // ExtraFiles specifies additional open files to be inherited by the container
+ ExtraFiles []*os.File
+
+ // ConsoleWidth and ConsoleHeight are the initial sizings for the console.
+ ConsoleWidth uint16
+ ConsoleHeight uint16
+
+ // Capabilities specify the capabilities to keep when executing the process inside the container
+ // All capabilities not specified will be dropped from the processes capability mask
+ Capabilities *configs.Capabilities
+
+ // AppArmorProfile specifies the profile to apply to the process and is
+ // changed at the time the process is execed
+ AppArmorProfile string
+
+ // Label specifies the label to apply to the process. It is commonly used by selinux
+ Label string
+
+ // NoNewPrivileges controls whether processes can gain additional privileges.
+ NoNewPrivileges *bool
+
+ // Rlimits specifies the resource limits, such as max open files, to set in the container
+ // If Rlimits are not set, the container will inherit rlimits from the parent process
+ Rlimits []configs.Rlimit
+
+ // ConsoleSocket provides the masterfd console.
+ ConsoleSocket *os.File
+
+ // Init specifies whether the process is the first process in the container.
+ Init bool
+
+ // ops is the live process handle; nil until the process has been
+ // started — TODO confirm against the container start path. The
+ // exported methods below treat a nil ops as "invalid process".
+ ops processOperations
+}
+
+// Wait waits for the process to exit.
+// Wait releases any resources associated with the Process
+func (p Process) Wait() (*os.ProcessState, error) {
+	ops := p.ops
+	if ops == nil {
+		// The process was never started (or ops was never wired up).
+		return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+	}
+	return ops.wait()
+}
+
+// Pid returns the process ID
+func (p Process) Pid() (int, error) {
+	ops := p.ops
+	if ops != nil {
+		return ops.pid(), nil
+	}
+	// math.MinInt32 is returned here, because it's invalid value
+	// for the kill() system call.
+	return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+}
+
+// Signal sends a signal to the Process.
+func (p Process) Signal(sig os.Signal) error {
+	if ops := p.ops; ops != nil {
+		return ops.signal(sig)
+	}
+	// No live process handle to deliver the signal to.
+	return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
+}
+
+// IO holds the process's STDIO. Note the directions are from the caller's
+// point of view: the caller writes the process's stdin and reads its
+// stdout/stderr.
+type IO struct {
+ Stdin io.WriteCloser
+ Stdout io.ReadCloser
+ Stderr io.ReadCloser
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "syscall" // only for Signal
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
+
+ "golang.org/x/sys/unix"
+)
+
+// Synchronisation value for cgroup namespace setup.
+// The same constant is defined in nsexec.c as "CREATECGROUPNS"; the C side
+// reads it from the init pipe as a single byte in stage 2 (JUMP_INIT).
+const createCgroupns = 0x80
+
+// parentProcess abstracts the runc-side handle on a bootstrapping container
+// process (either a setns/exec process or a fresh init process).
+type parentProcess interface {
+ // pid returns the pid for the running process.
+ pid() int
+
+ // start starts the process execution.
+ start() error
+
+ // terminate sends a SIGKILL to the process and waits for the exit.
+ terminate() error
+
+ // wait waits on the process returning the process state.
+ wait() (*os.ProcessState, error)
+
+ // startTime returns the process start time.
+ startTime() (uint64, error)
+
+ // signal delivers the given signal to the process.
+ signal(os.Signal) error
+
+ // externalDescriptors returns the saved stdio descriptor paths.
+ externalDescriptors() []string
+
+ // setExternalDescriptors records the stdio descriptor paths.
+ setExternalDescriptors(fds []string)
+}
+
+// setnsProcess is the parentProcess implementation used when joining an
+// existing container (runc exec): the helper binary setns(2)es into the
+// target container's namespaces before the Go runtime starts.
+type setnsProcess struct {
+ cmd *exec.Cmd // The re-exec'd runc helper process.
+ parentPipe *os.File // Our end of the init pipe (bootstrap data + JSON sync).
+ childPipe *os.File // The child's end of the init pipe; closed after start.
+ cgroupPaths map[string]string // Cgroup paths the new pid is entered into.
+ rootlessCgroups bool // If true, cgroup-join failures are tolerated.
+ intelRdtPath string // Intel RDT "resource control" path, if any.
+ config *initConfig
+ fds []string // Saved external descriptor paths.
+ process *Process
+ bootstrapData io.Reader // Netlink payload consumed by nsexec.c.
+}
+
+// startTime returns the process start time read via system.Stat.
+func (p *setnsProcess) startTime() (uint64, error) {
+	stat, err := system.Stat(p.pid())
+	if err != nil {
+		// Don't report a possibly-bogus start time alongside the error.
+		return 0, err
+	}
+	return stat.StartTime, nil
+}
+
+// signal delivers sig to the setns process via kill(2). Only values that
+// are really syscall.Signal can be forwarded.
+func (p *setnsProcess) signal(sig os.Signal) error {
+	sysSig, ok := sig.(syscall.Signal)
+	if !ok {
+		return errors.New("os: unsupported signal type")
+	}
+	return unix.Kill(p.pid(), sysSig)
+}
+
+// start launches the helper process, feeds it the netlink bootstrap data,
+// waits for the C-side setns dance to finish (execSetns), and then performs
+// the Go-side setup (cgroups, Intel RDT, rlimits, config hand-off) against
+// the final pid. The ordering of these steps is significant — see the
+// inline comments.
+func (p *setnsProcess) start() (err error) {
+ defer p.parentPipe.Close()
+ err = p.cmd.Start()
+ // Close our copy of the child's pipe end regardless of Start's outcome,
+ // so reads on parentPipe can see EOF when the child exits.
+ p.childPipe.Close()
+ if err != nil {
+ return newSystemErrorWithCause(err, "starting setns process")
+ }
+ if p.bootstrapData != nil {
+ // Hand the netlink payload to the C-side nsexec bootstrap.
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+ }
+ }
+ // Wait for the helper to die and recover the real (post-setns) pid.
+ if err = p.execSetns(); err != nil {
+ return newSystemErrorWithCause(err, "executing setns process")
+ }
+ if len(p.cgroupPaths) > 0 {
+ // Rootless setups may legitimately lack permission to join cgroups.
+ if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
+ return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
+ }
+ }
+ if p.intelRdtPath != "" {
+ // if Intel RDT "resource control" filesystem path exists
+ _, err := os.Stat(p.intelRdtPath)
+ if err == nil {
+ if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
+ return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
+ }
+ }
+ }
+ // set rlimits, this has to be done here because we lose permissions
+ // to raise the limits once we enter a user-namespace
+ if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "setting rlimits for process")
+ }
+ if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
+ return newSystemErrorWithCause(err, "writing config to pipe")
+ }
+
+ // Drain the child's sync messages; a setns process must not emit the
+ // init-only states, so anything received here is an error.
+ ierr := parseSync(p.parentPipe, func(sync *syncT) error {
+ switch sync.Type {
+ case procReady:
+ // This shouldn't happen.
+ panic("unexpected procReady in setns")
+ case procHooks:
+ // This shouldn't happen.
+ panic("unexpected procHooks in setns")
+ default:
+ return newSystemError(fmt.Errorf("invalid JSON payload from child"))
+ }
+ })
+
+ if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
+ return newSystemErrorWithCause(err, "calling shutdown on init pipe")
+ }
+ // Must be done after Shutdown so the child will exit and we can wait for it.
+ if ierr != nil {
+ p.wait()
+ return ierr
+ }
+ return nil
+}
+
+// execSetns runs the process that executes C code to perform the setns calls
+// because setns support requires the C process to fork off a child and perform the setns
+// before the go runtime boots, we wait on the process to die and receive the child's pid
+// over the provided pipe.
+// On success p.cmd.Process is re-pointed at the final (grand)child so later
+// signal/wait calls target it.
+func (p *setnsProcess) execSetns() error {
+ status, err := p.cmd.Process.Wait()
+ if err != nil {
+ p.cmd.Wait()
+ return newSystemErrorWithCause(err, "waiting on setns process to finish")
+ }
+ if !status.Success() {
+ p.cmd.Wait()
+ return newSystemError(&exec.ExitError{ProcessState: status})
+ }
+ var pid *pid
+ if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
+ p.cmd.Wait()
+ return newSystemErrorWithCause(err, "reading pid from init pipe")
+ }
+
+ // Clean up the zombie parent process
+ firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
+ if err != nil {
+ return err
+ }
+
+ // Ignore the error in case the child has already been reaped for any reason
+ _, _ = firstChildProcess.Wait()
+
+ process, err := os.FindProcess(pid.Pid)
+ if err != nil {
+ return err
+ }
+ p.cmd.Process = process
+ p.process.ops = p
+ return nil
+}
+
+// terminate sends a SIGKILL to the forked process for the setns routine
+// then reaps it so it does not linger as a zombie. A kill error takes
+// precedence over a wait error.
+func (p *setnsProcess) terminate() error {
+	if p.cmd.Process == nil {
+		return nil
+	}
+	killErr := p.cmd.Process.Kill()
+	_, waitErr := p.wait()
+	if killErr != nil {
+		return killErr
+	}
+	return waitErr
+}
+
+// wait blocks until the process exits. The final ProcessState is returned
+// even when Wait itself reports an error.
+func (p *setnsProcess) wait() (*os.ProcessState, error) {
+	waitErr := p.cmd.Wait()
+	return p.cmd.ProcessState, waitErr
+}
+
+// pid returns the operating-system pid of the underlying command.
+func (p *setnsProcess) pid() int {
+	proc := p.cmd.Process
+	return proc.Pid
+}
+
+// externalDescriptors returns the recorded stdio descriptor targets.
+func (p *setnsProcess) externalDescriptors() []string {
+	descriptors := p.fds
+	return descriptors
+}
+
+// setExternalDescriptors records the stdio descriptor targets for later
+// checkpoint/restore use.
+func (p *setnsProcess) setExternalDescriptors(descriptors []string) {
+	p.fds = descriptors
+}
+
+// initProcess is the parent-side handle for a container's first (init)
+// process, created fresh rather than joining an existing container.
+type initProcess struct {
+ cmd *exec.Cmd
+ parentPipe *os.File // parent end of the init pipe
+ childPipe *os.File // child end of the init pipe; closed right after Start
+ config *initConfig
+ manager cgroups.Manager
+ intelRdtManager intelrdt.Manager
+ container *linuxContainer
+ fds []string // stdio descriptor link targets, kept for checkpoint/restore
+ process *Process
+ bootstrapData io.Reader // nsenter bootstrap payload written to the pipe first
+ sharePidns bool // true when init shares the host pid namespace
+}
+
+// pid returns the operating-system pid of the container init process.
+func (p *initProcess) pid() int {
+	proc := p.cmd.Process
+	return proc.Pid
+}
+
+// externalDescriptors returns the recorded stdio descriptor targets.
+func (p *initProcess) externalDescriptors() []string {
+	descriptors := p.fds
+	return descriptors
+}
+
+// getChildPid receives the final child's pid over the provided pipe. When
+// decoding fails, the bootstrap process is reaped before returning so it
+// cannot be left as a zombie.
+func (p *initProcess) getChildPid() (int, error) {
+	var report pid
+	if err := json.NewDecoder(p.parentPipe).Decode(&report); err != nil {
+		p.cmd.Wait()
+		return -1, err
+	}
+	return report.Pid, nil
+}
+
+// waitForChildExit reaps the intermediate bootstrap process (p.cmd) and,
+// once it has exited successfully, re-points p.cmd.Process at the final
+// child identified by childPid so subsequent wait/signal calls target the
+// container init.
+func (p *initProcess) waitForChildExit(childPid int) error {
+ status, err := p.cmd.Process.Wait()
+ if err != nil {
+ p.cmd.Wait()
+ return err
+ }
+ if !status.Success() {
+ p.cmd.Wait()
+ return &exec.ExitError{ProcessState: status}
+ }
+
+ process, err := os.FindProcess(childPid)
+ if err != nil {
+ return err
+ }
+ p.cmd.Process = process
+ p.process.ops = p
+ return nil
+}
+
+// start launches the container init: it starts the bootstrap binary,
+// places it (and later the final child) into cgroups and Intel RDT groups,
+// streams the bootstrap data and configuration over the init pipe, and
+// drives the procReady/procHooks synchronization protocol (running
+// prestart hooks at the appropriate point) until init is ready to exec.
+// NOTE(review): several inner `if err := ...` statements shadow the outer
+// `err` observed by the deferred Destroy() cleanups, so those cleanups
+// only fire for errors assigned to the outer variable — confirm against
+// upstream before relying on cleanup in every failure path.
+func (p *initProcess) start() error {
+ defer p.parentPipe.Close()
+ err := p.cmd.Start()
+ p.process.ops = p
+ p.childPipe.Close()
+ if err != nil {
+ p.process.ops = nil
+ return newSystemErrorWithCause(err, "starting init process command")
+ }
+ // Do this before syncing with child so that no children can escape the
+ // cgroup. We don't need to worry about not doing this and not being root
+ // because we'd be using the rootless cgroup manager in that case.
+ if err := p.manager.Apply(p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+ }
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Apply(p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
+ }
+ }
+ defer func() {
+ if err != nil {
+ // TODO: should not be the responsibility to call here
+ p.manager.Destroy()
+ if p.intelRdtManager != nil {
+ p.intelRdtManager.Destroy()
+ }
+ }
+ }()
+
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+ }
+ childPid, err := p.getChildPid()
+ if err != nil {
+ return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
+ }
+
+ // Save the standard descriptor names before the container process
+ // can potentially move them (e.g., via dup2()). If we don't do this now,
+ // we won't know at checkpoint time which file descriptor to look up.
+ fds, err := getPipeFds(childPid)
+ if err != nil {
+ return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
+ }
+ p.setExternalDescriptors(fds)
+ // Do this before syncing with child so that no children
+ // can escape the cgroup
+ // NOTE(review): Apply was already called for the bootstrap pid above;
+ // this second call registers the final child — confirm the double
+ // apply is intended before changing it.
+ if err := p.manager.Apply(childPid); err != nil {
+ return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+ }
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Apply(childPid); err != nil {
+ return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
+ }
+ }
+ // Now it's time to setup cgroup namesapce
+ if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
+ if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
+ return newSystemErrorWithCause(err, "sending synchronization value to init process")
+ }
+ }
+
+ // Wait for our first child to exit
+ if err := p.waitForChildExit(childPid); err != nil {
+ return newSystemErrorWithCause(err, "waiting for our first child to exit")
+ }
+
+ defer func() {
+ if err != nil {
+ // TODO: should not be the responsibility to call here
+ p.manager.Destroy()
+ }
+ }()
+ if err := p.createNetworkInterfaces(); err != nil {
+ return newSystemErrorWithCause(err, "creating network interfaces")
+ }
+ if err := p.sendConfig(); err != nil {
+ return newSystemErrorWithCause(err, "sending config to init process")
+ }
+ var (
+ sentRun bool
+ sentResume bool
+ )
+
+ ierr := parseSync(p.parentPipe, func(sync *syncT) error {
+ switch sync.Type {
+ case procReady:
+ // set rlimits, this has to be done here because we lose permissions
+ // to raise the limits once we enter a user-namespace
+ if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "setting rlimits for ready process")
+ }
+ // call prestart hooks
+ if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
+ // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
+ if err := p.manager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting cgroup config for ready process")
+ }
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
+ }
+ }
+
+ if p.config.Config.Hooks != nil {
+ s, err := p.container.currentOCIState()
+ if err != nil {
+ return err
+ }
+ // initProcessStartTime hasn't been set yet.
+ s.Pid = p.cmd.Process.Pid
+ s.Status = "creating"
+ for i, hook := range p.config.Config.Hooks.Prestart {
+ if err := hook.Run(s); err != nil {
+ return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+ }
+ }
+ }
+ }
+ // Sync with child.
+ if err := writeSync(p.parentPipe, procRun); err != nil {
+ return newSystemErrorWithCause(err, "writing syncT 'run'")
+ }
+ sentRun = true
+ case procHooks:
+ // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
+ if err := p.manager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
+ }
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
+ }
+ }
+ if p.config.Config.Hooks != nil {
+ s, err := p.container.currentOCIState()
+ if err != nil {
+ return err
+ }
+ // initProcessStartTime hasn't been set yet.
+ s.Pid = p.cmd.Process.Pid
+ s.Status = "creating"
+ for i, hook := range p.config.Config.Hooks.Prestart {
+ if err := hook.Run(s); err != nil {
+ return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+ }
+ }
+ }
+ // Sync with child.
+ if err := writeSync(p.parentPipe, procResume); err != nil {
+ return newSystemErrorWithCause(err, "writing syncT 'resume'")
+ }
+ sentResume = true
+ default:
+ return newSystemError(fmt.Errorf("invalid JSON payload from child"))
+ }
+
+ return nil
+ })
+
+ if !sentRun {
+ return newSystemErrorWithCause(ierr, "container init")
+ }
+ if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
+ return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
+ }
+ if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
+ return newSystemErrorWithCause(err, "shutting down init pipe")
+ }
+
+ // Must be done after Shutdown so the child will exit and we can wait for it.
+ if ierr != nil {
+ p.wait()
+ return ierr
+ }
+ return nil
+}
+
+// wait reaps the container init process. When init shared the host pid
+// namespace, every remaining process in the container's cgroup is
+// SIGKILLed after init dies, since the kernel will not do that for us.
+func (p *initProcess) wait() (*os.ProcessState, error) {
+ err := p.cmd.Wait()
+ if err != nil {
+ return p.cmd.ProcessState, err
+ }
+ // we should kill all processes in cgroup when init is died if we use host PID namespace
+ if p.sharePidns {
+ // best effort: the error from signalling is deliberately ignored
+ signalAllProcesses(p.manager, unix.SIGKILL)
+ }
+ return p.cmd.ProcessState, nil
+}
+
+// terminate forcibly kills the init process and reaps it so it does not
+// linger as a zombie. A kill error takes precedence over a wait error.
+func (p *initProcess) terminate() error {
+	if p.cmd.Process == nil {
+		return nil
+	}
+	killErr := p.cmd.Process.Kill()
+	_, waitErr := p.wait()
+	if killErr != nil {
+		return killErr
+	}
+	return waitErr
+}
+
+// startTime returns the kernel-recorded start time of the init process,
+// as read from /proc/<pid>/stat.
+func (p *initProcess) startTime() (uint64, error) {
+	procStat, err := system.Stat(p.pid())
+	return procStat.StartTime, err
+}
+
+// sendConfig streams the init configuration to the container's init
+// process. utils.WriteJSON is used instead of a json.Encoder because of a
+// decoder problem in some cases, see:
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
+func (p *initProcess) sendConfig() error {
+	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
+		return err
+	}
+	return nil
+}
+
+// createNetworkInterfaces sets up each configured network for the init
+// process and records the resulting runtime state so it is included in
+// the config later sent to the child.
+func (p *initProcess) createNetworkInterfaces() error {
+	for _, netConfig := range p.config.Config.Networks {
+		strategy, err := getStrategy(netConfig.Type)
+		if err != nil {
+			return err
+		}
+		runtimeNet := &network{Network: *netConfig}
+		if err := strategy.create(runtimeNet, p.pid()); err != nil {
+			return err
+		}
+		p.config.Networks = append(p.config.Networks, runtimeNet)
+	}
+	return nil
+}
+
+// signal delivers sig to the init process. Only signals backed by a
+// syscall.Signal value are supported.
+func (p *initProcess) signal(sig os.Signal) error {
+	osSig, ok := sig.(syscall.Signal)
+	if !ok {
+		return errors.New("os: unsupported signal type")
+	}
+	return unix.Kill(p.pid(), osSig)
+}
+
+// setExternalDescriptors records the stdio descriptor targets for later
+// checkpoint/restore use.
+func (p *initProcess) setExternalDescriptors(descriptors []string) {
+	p.fds = descriptors
+}
+
+// getPipeFds returns the /proc/<pid>/fd symlink targets for the process's
+// three stdio descriptors. Entries that cannot be read due to permissions
+// (rootless containers, non-dumpable processes) are left empty rather
+// than failing the whole lookup.
+func getPipeFds(pid int) ([]string, error) {
+	fds := make([]string, 3)
+	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
+	for i := 0; i < 3; i++ {
+		// XXX: This breaks if the path is not a valid symlink (which can
+		// happen in certain particularly unlucky mount namespace setups).
+		link := filepath.Join(fdDir, strconv.Itoa(i))
+		target, err := os.Readlink(link)
+		if err != nil {
+			if os.IsPermission(err) {
+				// Best effort: skip descriptors we are not allowed to inspect.
+				continue
+			}
+			return fds, err
+		}
+		fds[i] = target
+	}
+	return fds, nil
+}
+
+// InitializeIO creates pipes for use with the process's stdio and returns the
+// opposite side for each. Do not use this if you want to have a pseudoterminal
+// set up for you by libcontainer (TODO: fix that too).
+// TODO: This is mostly unnecessary, and should be handled by clients.
+// All created descriptors are chowned to rootuid/rootgid so they remain
+// usable from inside a user namespace; on any error every pipe end opened
+// so far is closed before returning.
+func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
+ var fds []uintptr
+ i = &IO{}
+ // cleanup in case of an error
+ defer func() {
+ if err != nil {
+ for _, fd := range fds {
+ unix.Close(int(fd))
+ }
+ }
+ }()
+ // STDIN
+ r, w, err := os.Pipe()
+ if err != nil {
+ return nil, err
+ }
+ fds = append(fds, r.Fd(), w.Fd())
+ p.Stdin, i.Stdin = r, w
+ // STDOUT
+ if r, w, err = os.Pipe(); err != nil {
+ return nil, err
+ }
+ fds = append(fds, r.Fd(), w.Fd())
+ p.Stdout, i.Stdout = w, r
+ // STDERR
+ if r, w, err = os.Pipe(); err != nil {
+ return nil, err
+ }
+ fds = append(fds, r.Fd(), w.Fd())
+ p.Stderr, i.Stderr = w, r
+ // change ownership of the pipes in case we are in a user namespace
+ for _, fd := range fds {
+ if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
+ return nil, err
+ }
+ }
+ return i, nil
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer/system"
+)
+
+func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) {
+ var (
+ err error
+ )
+ proc, err := os.FindProcess(pid)
+ if err != nil {
+ return nil, err
+ }
+ stat, err := system.Stat(pid)
+ if err != nil {
+ return nil, err
+ }
+ return &restoredProcess{
+ proc: proc,
+ processStartTime: stat.StartTime,
+ fds: fds,
+ }, nil
+}
+
+// restoredProcess is the handle for a container process restored by CRIU;
+// the calling process is its parent, so it can be waited on directly.
+type restoredProcess struct {
+ proc *os.Process
+ processStartTime uint64 // start time captured at restore; guards against pid reuse
+ fds []string // stdio descriptor link targets, kept for checkpoint/restore
+}
+
+// start is invalid for a restored process: it is already running.
+func (p *restoredProcess) start() error {
+	err := fmt.Errorf("restored process cannot be started")
+	return newGenericError(err, SystemError)
+}
+
+// pid returns the pid of the restored process.
+func (p *restoredProcess) pid() int {
+	proc := p.proc
+	return proc.Pid
+}
+
+// terminate kills the restored process and reaps it so it does not linger
+// as a zombie. A kill error takes precedence over a wait error.
+func (p *restoredProcess) terminate() error {
+	killErr := p.proc.Kill()
+	_, waitErr := p.wait()
+	if killErr != nil {
+		return killErr
+	}
+	return waitErr
+}
+
+// wait blocks until the restored process exits and returns its state.
+// TODO: how do we wait on the actual process?
+// maybe use --exec-cmd in criu
+func (p *restoredProcess) wait() (*os.ProcessState, error) {
+	state, err := p.proc.Wait()
+	if err != nil {
+		return nil, err
+	}
+	return state, nil
+}
+
+// startTime returns the start time captured when the process was restored.
+func (p *restoredProcess) startTime() (uint64, error) {
+	st := p.processStartTime
+	return st, nil
+}
+
+// signal forwards s to the restored process.
+func (p *restoredProcess) signal(s os.Signal) error {
+	target := p.proc
+	return target.Signal(s)
+}
+
+// externalDescriptors returns the recorded stdio descriptor targets.
+func (p *restoredProcess) externalDescriptors() []string {
+	descriptors := p.fds
+	return descriptors
+}
+
+// setExternalDescriptors records the stdio descriptor targets.
+func (p *restoredProcess) setExternalDescriptors(descriptors []string) {
+	p.fds = descriptors
+}
+
+// nonChildProcess represents a process where the calling process is not
+// the parent process. This process is created when a factory loads a container from
+// a persisted state.
+// Because we are not the parent, it cannot be started, terminated, or
+// waited on; only pid, start time, signalling and descriptor bookkeeping
+// are available.
+type nonChildProcess struct {
+ processPid int
+ processStartTime uint64 // start time recorded in the persisted state
+ fds []string // stdio descriptor link targets from the persisted state
+}
+
+// start is invalid: we are not the parent of this process.
+func (p *nonChildProcess) start() error {
+	err := fmt.Errorf("restored process cannot be started")
+	return newGenericError(err, SystemError)
+}
+
+// pid returns the pid recorded in the persisted container state.
+func (p *nonChildProcess) pid() int {
+	persistedPid := p.processPid
+	return persistedPid
+}
+
+// terminate is invalid: we are not the parent of this process.
+func (p *nonChildProcess) terminate() error {
+	err := fmt.Errorf("restored process cannot be terminated")
+	return newGenericError(err, SystemError)
+}
+
+// wait is invalid: we are not the parent of this process.
+func (p *nonChildProcess) wait() (*os.ProcessState, error) {
+	err := fmt.Errorf("restored process cannot be waited on")
+	return nil, newGenericError(err, SystemError)
+}
+
+// startTime returns the start time recorded in the persisted state.
+func (p *nonChildProcess) startTime() (uint64, error) {
+	st := p.processStartTime
+	return st, nil
+}
+
+// signal looks the process up by its persisted pid and forwards s to it.
+func (p *nonChildProcess) signal(s os.Signal) error {
+	target, err := os.FindProcess(p.processPid)
+	if err != nil {
+		return err
+	}
+	return target.Signal(s)
+}
+
+// externalDescriptors returns the recorded stdio descriptor targets.
+func (p *nonChildProcess) externalDescriptors() []string {
+	descriptors := p.fds
+	return descriptors
+}
+
+// setExternalDescriptors records the stdio descriptor targets.
+func (p *nonChildProcess) setExternalDescriptors(descriptors []string) {
+	p.fds = descriptors
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/cyphar/filepath-securejoin"
+ "github.com/mrunalp/fileutils"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/mount"
+ "github.com/opencontainers/runc/libcontainer/system"
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/selinux/go-selinux/label"
+
+ "golang.org/x/sys/unix"
+)
+
+// defaultMountFlags are the mount(2) flags applied to the kernel-backed
+// filesystems mounted into the container (no exec, no suid, no device files).
+const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+
+// needsSetupDev reports whether runc must populate /dev itself; it returns
+// false when the user configured an explicit bind mount over /dev.
+func needsSetupDev(config *configs.Config) bool {
+	for _, m := range config.Mounts {
+		if m.Device != "bind" {
+			continue
+		}
+		if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+			return false
+		}
+	}
+	return true
+}
+
+// prepareRootfs sets up the devices, mount points, and filesystems for use
+// inside a new mount namespace. It doesn't set anything as ro. You must call
+// finalizeRootfs after this function to finish setting up the rootfs.
+// pipe is the init pipe used to signal the parent to run prestart hooks
+// once all mounts are in place but before we pivot/chroot into the rootfs.
+func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
+ config := iConfig.Config
+ if err := prepareRoot(config); err != nil {
+ return newSystemErrorWithCause(err, "preparing rootfs")
+ }
+
+ hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
+ setupDev := needsSetupDev(config)
+ for _, m := range config.Mounts {
+ for _, precmd := range m.PremountCmds {
+ if err := mountCmd(precmd); err != nil {
+ return newSystemErrorWithCause(err, "running premount command")
+ }
+ }
+ if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
+ return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
+ }
+
+ for _, postcmd := range m.PostmountCmds {
+ if err := mountCmd(postcmd); err != nil {
+ return newSystemErrorWithCause(err, "running postmount command")
+ }
+ }
+ }
+
+ if setupDev {
+ if err := createDevices(config); err != nil {
+ return newSystemErrorWithCause(err, "creating device nodes")
+ }
+ if err := setupPtmx(config); err != nil {
+ return newSystemErrorWithCause(err, "setting up ptmx")
+ }
+ if err := setupDevSymlinks(config.Rootfs); err != nil {
+ return newSystemErrorWithCause(err, "setting up /dev symlinks")
+ }
+ }
+
+ // Signal the parent to run the pre-start hooks.
+ // The hooks are run after the mounts are setup, but before we switch to the new
+ // root, so that the old root is still available in the hooks for any mount
+ // manipulations.
+ // Note that iConfig.Cwd is not guaranteed to exist here.
+ if err := syncParentHooks(pipe); err != nil {
+ return err
+ }
+
+ // The reason these operations are done here rather than in finalizeRootfs
+ // is because the console-handling code gets quite sticky if we have to set
+ // up the console before doing the pivot_root(2). This is because the
+ // Console API has to also work with the ExecIn case, which means that the
+ // API must be able to deal with being inside as well as outside the
+ // container. It's just cleaner to do this here (at the expense of the
+ // operation not being perfectly split).
+
+ if err := unix.Chdir(config.Rootfs); err != nil {
+ return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
+ }
+
+ // Switch the root: MS_MOVE when pivoting is disabled, pivot_root(2) when
+ // we own a fresh mount namespace, chroot(2) otherwise.
+ if config.NoPivotRoot {
+ err = msMoveRoot(config.Rootfs)
+ } else if config.Namespaces.Contains(configs.NEWNS) {
+ err = pivotRoot(config.Rootfs)
+ } else {
+ err = chroot(config.Rootfs)
+ }
+ if err != nil {
+ return newSystemErrorWithCause(err, "jailing process inside rootfs")
+ }
+
+ if setupDev {
+ if err := reOpenDevNull(); err != nil {
+ return newSystemErrorWithCause(err, "reopening /dev/null inside container")
+ }
+ }
+
+ if cwd := iConfig.Cwd; cwd != "" {
+ // Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...".
+ // However, we are safe to call MkDirAll directly because we are in the jail here.
+ if err := os.MkdirAll(cwd, 0755); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+// finalizeRootfs sets anything to ro if necessary. You must call
+// prepareRootfs first.
+func finalizeRootfs(config *configs.Config) (err error) {
+ // remount dev as ro if specified
+ for _, m := range config.Mounts {
+ if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
+ if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
+ if err := remountReadonly(m); err != nil {
+ return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
+ }
+ }
+ // only the first /dev mount is considered
+ break
+ }
+ }
+
+ // set rootfs ( / ) as readonly
+ if config.Readonlyfs {
+ if err := setReadonly(); err != nil {
+ return newSystemErrorWithCause(err, "setting rootfs as readonly")
+ }
+ }
+
+ // restore a conventional umask after the 0000 used during setup
+ unix.Umask(0022)
+ return nil
+}
+
+// prepareTmp creates a scratch directory under topTmpDir that is
+// bind-mounted onto itself and made private, because /tmp has to be
+// mounted as private for MS_MOVE to work in all situations.
+func prepareTmp(topTmpDir string) (string, error) {
+	dir, err := ioutil.TempDir(topTmpDir, "runctop")
+	if err != nil {
+		return "", err
+	}
+	if err := unix.Mount(dir, dir, "bind", unix.MS_BIND, ""); err != nil {
+		return "", err
+	}
+	if err := unix.Mount("", dir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
+		return "", err
+	}
+	return dir, nil
+}
+
+// cleanupTmp unmounts and removes a directory created by prepareTmp. The
+// unmount error is deliberately ignored; RemoveAll reports the outcome.
+func cleanupTmp(tmpdir string) error {
+	_ = unix.Unmount(tmpdir, 0)
+	return os.RemoveAll(tmpdir)
+}
+
+// mountCmd runs a user-configured pre/post-mount hook command, folding its
+// combined output into the returned error on failure.
+func mountCmd(cmd configs.Command) error {
+	command := exec.Command(cmd.Path, cmd.Args...)
+	command.Env = cmd.Env
+	command.Dir = cmd.Dir
+	out, err := command.CombinedOutput()
+	if err != nil {
+		return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
+	}
+	return nil
+}
+
+// mountToRootfs mounts m into rootfs, dispatching on the device type:
+// plain kernel filesystems (proc/sysfs), mqueue with a SELinux-label
+// fallback, tmpfs with optional copy-up of existing content, bind mounts
+// with symlink-safe destination resolution, per-subsystem cgroup trees,
+// and a generic default path. enableCgroupns selects fresh cgroup mounts
+// instead of bind-mounting the host hierarchy.
+func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
+ var (
+ dest = m.Destination
+ )
+ if !strings.HasPrefix(dest, rootfs) {
+ dest = filepath.Join(rootfs, dest)
+ }
+
+ switch m.Device {
+ case "proc", "sysfs":
+ if err := os.MkdirAll(dest, 0755); err != nil {
+ return err
+ }
+ // Selinux kernels do not support labeling of /proc or /sys
+ return mountPropagate(m, rootfs, "")
+ case "mqueue":
+ if err := os.MkdirAll(dest, 0755); err != nil {
+ return err
+ }
+ if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+ // older kernels do not support labeling of /dev/mqueue
+ if err := mountPropagate(m, rootfs, ""); err != nil {
+ return err
+ }
+ return label.SetFileLabel(dest, mountLabel)
+ }
+ return nil
+ case "tmpfs":
+ copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
+ tmpDir := ""
+ // NOTE(review): any Stat error (not just "does not exist") leaves
+ // stat nil and triggers MkdirAll — confirm this catch-all is intended.
+ stat, err := os.Stat(dest)
+ if err != nil {
+ if err := os.MkdirAll(dest, 0755); err != nil {
+ return err
+ }
+ }
+ if copyUp {
+ tmpdir, err := prepareTmp("/tmp")
+ if err != nil {
+ return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
+ }
+ defer cleanupTmp(tmpdir)
+ tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
+ if err != nil {
+ return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
+ }
+ defer os.RemoveAll(tmpDir)
+ // Mount the tmpfs at the scratch location first; it is moved
+ // onto dest after the existing content has been copied in.
+ m.Destination = tmpDir
+ }
+ if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+ return err
+ }
+ if copyUp {
+ if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
+ errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
+ if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
+ return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
+ }
+ return errMsg
+ }
+ if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
+ errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
+ if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
+ return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
+ }
+ return errMsg
+ }
+ }
+ if stat != nil {
+ // preserve the pre-existing directory's mode on the new tmpfs
+ if err = os.Chmod(dest, stat.Mode()); err != nil {
+ return err
+ }
+ }
+ return nil
+ case "bind":
+ stat, err := os.Stat(m.Source)
+ if err != nil {
+ // error out if the source of a bind mount does not exist as we will be
+ // unable to bind anything to it.
+ return err
+ }
+ // ensure that the destination of the bind mount is resolved of symlinks at mount time because
+ // any previous mounts can invalidate the next mount's destination.
+ // this can happen when a user specifies mounts within other mounts to cause breakouts or other
+ // evil stuff to try to escape the container's rootfs.
+ if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
+ return err
+ }
+ if err := checkMountDestination(rootfs, dest); err != nil {
+ return err
+ }
+ // update the mount with the correct dest after symlinks are resolved.
+ m.Destination = dest
+ if err := createIfNotExists(dest, stat.IsDir()); err != nil {
+ return err
+ }
+ if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+ return err
+ }
+ // bind mount won't change mount options, we need remount to make mount options effective.
+ // first check that we have non-default options required before attempting a remount
+ if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
+ // only remount if unique mount options are set
+ if err := remount(m, rootfs); err != nil {
+ return err
+ }
+ }
+
+ if m.Relabel != "" {
+ if err := label.Validate(m.Relabel); err != nil {
+ return err
+ }
+ shared := label.IsShared(m.Relabel)
+ if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
+ return err
+ }
+ }
+ case "cgroup":
+ binds, err := getCgroupMounts(m)
+ if err != nil {
+ return err
+ }
+ // collect co-mounted subsystems (e.g. "cpu,cpuacct") so symlinks for
+ // the individual names can be created below
+ var merged []string
+ for _, b := range binds {
+ ss := filepath.Base(b.Destination)
+ if strings.Contains(ss, ",") {
+ merged = append(merged, ss)
+ }
+ }
+ tmpfs := &configs.Mount{
+ Source: "tmpfs",
+ Device: "tmpfs",
+ Destination: m.Destination,
+ Flags: defaultMountFlags,
+ Data: "mode=755",
+ PropagationFlags: m.PropagationFlags,
+ }
+ if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil {
+ return err
+ }
+ for _, b := range binds {
+ if enableCgroupns {
+ subsystemPath := filepath.Join(rootfs, b.Destination)
+ if err := os.MkdirAll(subsystemPath, 0755); err != nil {
+ return err
+ }
+ flags := defaultMountFlags
+ if m.Flags&unix.MS_RDONLY != 0 {
+ flags = flags | unix.MS_RDONLY
+ }
+ cgroupmount := &configs.Mount{
+ Source: "cgroup",
+ Device: "cgroup",
+ Destination: subsystemPath,
+ Flags: flags,
+ Data: filepath.Base(subsystemPath),
+ }
+ if err := mountNewCgroup(cgroupmount); err != nil {
+ return err
+ }
+ } else {
+ if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil {
+ return err
+ }
+ }
+ }
+ for _, mc := range merged {
+ for _, ss := range strings.Split(mc, ",") {
+ // symlink(2) is very dumb, it will just shove the path into
+ // the link and doesn't do any checks or relative path
+ // conversion. Also, don't error out if the cgroup already exists.
+ if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
+ return err
+ }
+ }
+ }
+ if m.Flags&unix.MS_RDONLY != 0 {
+ // remount cgroup root as readonly
+ mcgrouproot := &configs.Mount{
+ Source: m.Destination,
+ Device: "bind",
+ Destination: m.Destination,
+ Flags: defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND,
+ }
+ if err := remount(mcgrouproot, rootfs); err != nil {
+ return err
+ }
+ }
+ default:
+ // ensure that the destination of the mount is resolved of symlinks at mount time because
+ // any previous mounts can invalidate the next mount's destination.
+ // this can happen when a user specifies mounts within other mounts to cause breakouts or other
+ // evil stuff to try to escape the container's rootfs.
+ var err error
+ if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
+ return err
+ }
+ if err := checkMountDestination(rootfs, dest); err != nil {
+ return err
+ }
+ // update the mount with the correct dest after symlinks are resolved.
+ m.Destination = dest
+ if err := os.MkdirAll(dest, 0755); err != nil {
+ return err
+ }
+ return mountPropagate(m, rootfs, mountLabel)
+ }
+ return nil
+}
+
+// getCgroupMounts translates the container's cgroup mount m into one bind
+// mount per host cgroup hierarchy, each sourced from the calling process's
+// own cgroup path so the container sees its current cgroups.
+func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
+ mounts, err := cgroups.GetCgroupMounts(false)
+ if err != nil {
+ return nil, err
+ }
+
+ cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
+ if err != nil {
+ return nil, err
+ }
+
+ var binds []*configs.Mount
+
+ for _, mm := range mounts {
+ dir, err := mm.GetOwnCgroup(cgroupPaths)
+ if err != nil {
+ return nil, err
+ }
+ relDir, err := filepath.Rel(mm.Root, dir)
+ if err != nil {
+ return nil, err
+ }
+ binds = append(binds, &configs.Mount{
+ Device: "bind",
+ Source: filepath.Join(mm.Mountpoint, relDir),
+ Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
+ Flags: unix.MS_BIND | unix.MS_REC | m.Flags,
+ PropagationFlags: m.PropagationFlags,
+ })
+ }
+
+ return binds, nil
+}
+
+// checkMountDestination checks to ensure that the mount destination is not
+// over the top of /proc. dest is required to be an abs path and have any
+// symlinks resolved before calling this function. A whitelist of individual
+// /proc files commonly emulated (e.g. via FUSE) for in-container stats
+// tools is allowed through.
+func checkMountDestination(rootfs, dest string) error {
+	invalidDestinations := []string{
+		"/proc",
+	}
+	// Whitelisted sub-paths of the invalid destinations above.
+	validDestinations := []string{
+		// These entries can be bind mounted by files emulated by fuse,
+		// so commands like top, free displays stats in container.
+		"/proc/cpuinfo",
+		"/proc/diskstats",
+		"/proc/meminfo",
+		"/proc/stat",
+		"/proc/swaps",
+		"/proc/uptime",
+		"/proc/loadavg",
+		"/proc/net/dev",
+	}
+	for _, valid := range validDestinations {
+		rel, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
+		if err != nil {
+			return err
+		}
+		if rel == "." {
+			return nil
+		}
+	}
+	for _, invalid := range invalidDestinations {
+		rel, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
+		if err != nil {
+			return err
+		}
+		if rel != "." && !strings.HasPrefix(rel, "..") {
+			return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
+		}
+	}
+	return nil
+}
+
+func setupDevSymlinks(rootfs string) error {
+ var links = [][2]string{
+ {"/proc/self/fd", "/dev/fd"},
+ {"/proc/self/fd/0", "/dev/stdin"},
+ {"/proc/self/fd/1", "/dev/stdout"},
+ {"/proc/self/fd/2", "/dev/stderr"},
+ }
+ // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
+ // in /dev if it exists in /proc.
+ if _, err := os.Stat("/proc/kcore"); err == nil {
+ links = append(links, [2]string{"/proc/kcore", "/dev/core"})
+ }
+ for _, link := range links {
+ var (
+ src = link[0]
+ dst = filepath.Join(rootfs, link[1])
+ )
+ if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
+ return fmt.Errorf("symlink %s %s %s", src, dst, err)
+ }
+ }
+ return nil
+}
+
+// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
+// this method will make them point to `/dev/null` in this container's rootfs. This
+// needs to be called after we chroot/pivot into the container's rootfs so that any
+// symlinks are resolved locally.
+func reOpenDevNull() error {
+ var stat, devNullStat unix.Stat_t
+ file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
+ if err != nil {
+ return fmt.Errorf("Failed to open /dev/null - %s", err)
+ }
+ defer file.Close()
+ if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
+ return err
+ }
+ // Compare each stdio fd's device number against the container's
+ // /dev/null; matching fds are re-pointed at the local node.
+ for fd := 0; fd < 3; fd++ {
+ if err := unix.Fstat(fd, &stat); err != nil {
+ return err
+ }
+ if stat.Rdev == devNullStat.Rdev {
+ // Close and re-open the fd.
+ if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// createDevices populates /dev in the container with the configured device
+// nodes. Processes inside a user namespace are not allowed to mknod, so in
+// that case the nodes are bind-mounted from the host instead.
+func createDevices(config *configs.Config) error {
+	useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
+	// Clear the umask so nodes are created with exactly the configured mode;
+	// it is restored on every return path.
+	oldMask := unix.Umask(0000)
+	defer unix.Umask(oldMask)
+	for _, node := range config.Devices {
+		if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func bindMountDeviceNode(dest string, node *configs.Device) error {
+ f, err := os.Create(dest)
+ if err != nil && !os.IsExist(err) {
+ return err
+ }
+ if f != nil {
+ f.Close()
+ }
+ return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
+}
+
+// Creates the device node in the rootfs of the container.
+func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
+ dest := filepath.Join(rootfs, node.Path)
+ if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
+ return err
+ }
+
+ if bind {
+ return bindMountDeviceNode(dest, node)
+ }
+ if err := mknodDevice(dest, node); err != nil {
+ if os.IsExist(err) {
+ return nil
+ } else if os.IsPermission(err) {
+ return bindMountDeviceNode(dest, node)
+ }
+ return err
+ }
+ return nil
+}
+
+func mknodDevice(dest string, node *configs.Device) error {
+ fileMode := node.FileMode
+ switch node.Type {
+ case 'c', 'u':
+ fileMode |= unix.S_IFCHR
+ case 'b':
+ fileMode |= unix.S_IFBLK
+ case 'p':
+ fileMode |= unix.S_IFIFO
+ default:
+ return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
+ }
+ if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
+ return err
+ }
+ return unix.Chown(dest, int(node.Uid), int(node.Gid))
+}
+
+func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
+ for _, m := range mountinfo {
+ if m.Mountpoint == dir {
+ return m
+ }
+ }
+ return nil
+}
+
+// Get the parent mount point of directory passed in as argument. Also return
+// optional fields.
+func getParentMount(rootfs string) (string, string, error) {
+ var path string
+
+ mountinfos, err := mount.GetMounts()
+ if err != nil {
+ return "", "", err
+ }
+
+ mountinfo := getMountInfo(mountinfos, rootfs)
+ if mountinfo != nil {
+ return rootfs, mountinfo.Optional, nil
+ }
+
+ path = rootfs
+ for {
+ path = filepath.Dir(path)
+
+ mountinfo = getMountInfo(mountinfos, path)
+ if mountinfo != nil {
+ return path, mountinfo.Optional, nil
+ }
+
+ if path == "/" {
+ break
+ }
+ }
+
+ // If we are here, we did not find parent mount. Something is wrong.
+ return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs)
+}
+
+// Make parent mount private if it was shared
+func rootfsParentMountPrivate(rootfs string) error {
+ sharedMount := false
+
+ parentMount, optionalOpts, err := getParentMount(rootfs)
+ if err != nil {
+ return err
+ }
+
+ optsSplit := strings.Split(optionalOpts, " ")
+ for _, opt := range optsSplit {
+ if strings.HasPrefix(opt, "shared:") {
+ sharedMount = true
+ break
+ }
+ }
+
+ // Make parent mount PRIVATE if it was shared. It is needed for two
+ // reasons. First of all pivot_root() will fail if parent mount is
+ // shared. Secondly when we bind mount rootfs it will propagate to
+ // parent namespace and we don't want that to happen.
+ if sharedMount {
+ return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "")
+ }
+
+ return nil
+}
+
// prepareRoot sets mount propagation on "/" (rslave recursively by default,
// or the configured RootPropagation), makes the rootfs's parent mount private
// as required by pivot_root, and finally turns the rootfs into a recursive
// bind mount of itself so it can later be pivoted into.
func prepareRoot(config *configs.Config) error {
	// Default propagation is rslave; a non-zero RootPropagation overrides it.
	flag := unix.MS_SLAVE | unix.MS_REC
	if config.RootPropagation != 0 {
		flag = config.RootPropagation
	}
	if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil {
		return err
	}

	// Make parent mount private to make sure following bind mount does
	// not propagate in other namespaces. Also it will help with kernel
	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
	if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
		return err
	}

	return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
}
+
// setReadonly remounts the container's root ("/") in place as a recursive
// read-only bind mount.
func setReadonly() error {
	return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}
+
+func setupPtmx(config *configs.Config) error {
+ ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
+ if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ if err := os.Symlink("pts/ptmx", ptmx); err != nil {
+ return fmt.Errorf("symlink dev ptmx %s", err)
+ }
+ return nil
+}
+
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
	// While the documentation may claim otherwise, pivot_root(".", ".") is
	// actually valid. What this results in is / being the new root but
	// /proc/self/cwd being the old root. Since we can play around with the cwd
	// with pivot_root this allows us to pivot without creating directories in
	// the rootfs. Shout-outs to the LXC developers for giving us this idea.

	// Hold fds to both roots so we can fchdir to either regardless of what
	// pivot_root does to the mount tree.
	oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer unix.Close(oldroot)

	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer unix.Close(newroot)

	// Change to the new root so that the pivot_root actually acts on it.
	if err := unix.Fchdir(newroot); err != nil {
		return err
	}

	if err := unix.PivotRoot(".", "."); err != nil {
		return fmt.Errorf("pivot_root %s", err)
	}

	// Currently our "." is oldroot (according to the current kernel code).
	// However, purely for safety, we will fchdir(oldroot) since there isn't
	// really any guarantee from the kernel what /proc/self/cwd will be after a
	// pivot_root(2).

	if err := unix.Fchdir(oldroot); err != nil {
		return err
	}

	// Make oldroot rslave to make sure our unmounts don't propagate to the
	// host (and thus bork the machine). We don't use rprivate because this is
	// known to cause issues due to races where we still have a reference to a
	// mount while a process in the host namespace are trying to operate on
	// something they think has no mounts (devicemapper in particular).
	if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
		return err
	}
	// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
	if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
		return err
	}

	// Switch back to our shiny new root.
	if err := unix.Chdir("/"); err != nil {
		return fmt.Errorf("chdir / %s", err)
	}
	return nil
}
+
// msMoveRoot moves the mount tree at rootfs on top of "/" and then chroots
// into it. NOTE(review): presumably the non-pivot_root fallback path —
// confirm at the call site.
func msMoveRoot(rootfs string) error {
	if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
		return err
	}
	return chroot(rootfs)
}
+
// chroot changes the process root to the current working directory and then
// chdirs to the new "/". NOTE(review): the rootfs parameter is unused here;
// the caller (msMoveRoot) has already moved rootfs onto "/" — confirm that
// "." is guaranteed to be inside the new root before relying on this.
func chroot(rootfs string) error {
	if err := unix.Chroot("."); err != nil {
		return err
	}
	return unix.Chdir("/")
}
+
+// createIfNotExists creates a file or a directory only if it does not already exist.
+func createIfNotExists(path string, isDir bool) error {
+ if _, err := os.Stat(path); err != nil {
+ if os.IsNotExist(err) {
+ if isDir {
+ return os.MkdirAll(path, 0755)
+ }
+ if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+ return err
+ }
+ f, err := os.OpenFile(path, os.O_CREATE, 0755)
+ if err != nil {
+ return err
+ }
+ f.Close()
+ }
+ }
+ return nil
+}
+
+// readonlyPath will make a path read only.
+func readonlyPath(path string) error {
+ if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
+ if os.IsNotExist(err) {
+ return nil
+ }
+ return err
+ }
+ return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
+}
+
+// remountReadonly will remount an existing mount point and ensure that it is read-only.
+func remountReadonly(m *configs.Mount) error {
+ var (
+ dest = m.Destination
+ flags = m.Flags
+ )
+ for i := 0; i < 5; i++ {
+ // There is a special case in the kernel for
+ // MS_REMOUNT | MS_BIND, which allows us to change only the
+ // flags even as an unprivileged user (i.e. user namespace)
+ // assuming we don't drop any security related flags (nodev,
+ // nosuid, etc.). So, let's use that case so that we can do
+ // this re-mount without failing in a userns.
+ flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
+ if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil {
+ switch err {
+ case unix.EBUSY:
+ time.Sleep(100 * time.Millisecond)
+ continue
+ default:
+ return err
+ }
+ }
+ return nil
+ }
+ return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
+}
+
+// maskPath masks the top of the specified path inside a container to avoid
+// security issues from processes reading information from non-namespace aware
+// mounts ( proc/kcore ).
+// For files, maskPath bind mounts /dev/null over the top of the specified path.
+// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
+func maskPath(path string, mountLabel string) error {
+ if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
+ if err == unix.ENOTDIR {
+ return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
+ }
+ return err
+ }
+ return nil
+}
+
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
	elems := strings.Split(key, ".")
	target := path.Join(append([]string{"/proc/sys"}, elems...)...)
	return ioutil.WriteFile(target, []byte(value), 0644)
}
+
+func remount(m *configs.Mount, rootfs string) error {
+ var (
+ dest = m.Destination
+ )
+ if !strings.HasPrefix(dest, rootfs) {
+ dest = filepath.Join(rootfs, dest)
+ }
+ return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
+}
+
+// Do the mount operation followed by additional mounts required to take care
+// of propagation flags.
+func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
+ var (
+ dest = m.Destination
+ data = label.FormatMountLabel(m.Data, mountLabel)
+ flags = m.Flags
+ )
+ if libcontainerUtils.CleanPath(dest) == "/dev" {
+ flags &= ^unix.MS_RDONLY
+ }
+
+ copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
+ if !(copyUp || strings.HasPrefix(dest, rootfs)) {
+ dest = filepath.Join(rootfs, dest)
+ }
+
+ if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
+ return err
+ }
+
+ for _, pflag := range m.PropagationFlags {
+ if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func mountNewCgroup(m *configs.Mount) error {
+ var (
+ data = m.Data
+ source = m.Source
+ )
+ if data == "systemd" {
+ data = cgroups.CgroupNamePrefix + data
+ source = "systemd"
+ }
+ if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
+ return err
+ }
+ return nil
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func TestCheckMountDestOnProc(t *testing.T) {
+ dest := "/rootfs/proc/sys"
+ err := checkMountDestination("/rootfs", dest)
+ if err == nil {
+ t.Fatal("destination inside proc should return an error")
+ }
+}
+
+func TestCheckMountDestOnProcChroot(t *testing.T) {
+ dest := "/rootfs/proc/"
+ err := checkMountDestination("/rootfs", dest)
+ if err != nil {
+ t.Fatal("destination inside proc when using chroot should not return an error")
+ }
+}
+
+func TestCheckMountDestInSys(t *testing.T) {
+ dest := "/rootfs//sys/fs/cgroup"
+ err := checkMountDestination("/rootfs", dest)
+ if err != nil {
+ t.Fatal("destination inside /sys should not return an error")
+ }
+}
+
+func TestCheckMountDestFalsePositive(t *testing.T) {
+ dest := "/rootfs/sysfiles/fs/cgroup"
+ err := checkMountDestination("/rootfs", dest)
+ if err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestNeedsSetupDev(t *testing.T) {
+ config := &configs.Config{
+ Mounts: []*configs.Mount{
+ {
+ Device: "bind",
+ Source: "/dev",
+ Destination: "/dev",
+ },
+ },
+ }
+ if needsSetupDev(config) {
+ t.Fatal("expected needsSetupDev to be false, got true")
+ }
+}
+
+func TestNeedsSetupDevStrangeSource(t *testing.T) {
+ config := &configs.Config{
+ Mounts: []*configs.Mount{
+ {
+ Device: "bind",
+ Source: "/devx",
+ Destination: "/dev",
+ },
+ },
+ }
+ if needsSetupDev(config) {
+ t.Fatal("expected needsSetupDev to be false, got true")
+ }
+}
+
+func TestNeedsSetupDevStrangeDest(t *testing.T) {
+ config := &configs.Config{
+ Mounts: []*configs.Mount{
+ {
+ Device: "bind",
+ Source: "/dev",
+ Destination: "/devx",
+ },
+ },
+ }
+ if !needsSetupDev(config) {
+ t.Fatal("expected needsSetupDev to be true, got false")
+ }
+}
+
+func TestNeedsSetupDevStrangeSourceDest(t *testing.T) {
+ config := &configs.Config{
+ Mounts: []*configs.Mount{
+ {
+ Device: "bind",
+ Source: "/devx",
+ Destination: "/devx",
+ },
+ },
+ }
+ if !needsSetupDev(config) {
+ t.Fatal("expected needsSetupDev to be true, got false")
+ }
+}
--- /dev/null
+package seccomp
+
+import (
+ "fmt"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// operators maps libseccomp SCMP_CMP_* operator names to libcontainer
// comparison operators.
var operators = map[string]configs.Operator{
	"SCMP_CMP_NE":        configs.NotEqualTo,
	"SCMP_CMP_LT":        configs.LessThan,
	"SCMP_CMP_LE":        configs.LessThanOrEqualTo,
	"SCMP_CMP_EQ":        configs.EqualTo,
	"SCMP_CMP_GE":        configs.GreaterThanOrEqualTo,
	"SCMP_CMP_GT":        configs.GreaterThan,
	"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}

// actions maps libseccomp SCMP_ACT_* action names to libcontainer actions.
var actions = map[string]configs.Action{
	"SCMP_ACT_KILL":  configs.Kill,
	"SCMP_ACT_ERRNO": configs.Errno,
	"SCMP_ACT_TRAP":  configs.Trap,
	"SCMP_ACT_ALLOW": configs.Allow,
	"SCMP_ACT_TRACE": configs.Trace,
}

// archs maps libseccomp SCMP_ARCH_* architecture names to the corresponding
// Go-style architecture strings.
var archs = map[string]string{
	"SCMP_ARCH_X86":         "x86",
	"SCMP_ARCH_X86_64":      "amd64",
	"SCMP_ARCH_X32":         "x32",
	"SCMP_ARCH_ARM":         "arm",
	"SCMP_ARCH_AARCH64":     "arm64",
	"SCMP_ARCH_MIPS":        "mips",
	"SCMP_ARCH_MIPS64":      "mips64",
	"SCMP_ARCH_MIPS64N32":   "mips64n32",
	"SCMP_ARCH_MIPSEL":      "mipsel",
	"SCMP_ARCH_MIPSEL64":    "mipsel64",
	"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
	"SCMP_ARCH_PPC":         "ppc",
	"SCMP_ARCH_PPC64":       "ppc64",
	"SCMP_ARCH_PPC64LE":     "ppc64le",
	"SCMP_ARCH_S390":        "s390",
	"SCMP_ARCH_S390X":       "s390x",
}
+
+// ConvertStringToOperator converts a string into a Seccomp comparison operator.
+// Comparison operators use the names they are assigned by Libseccomp's header.
+// Attempting to convert a string that is not a valid operator results in an
+// error.
+func ConvertStringToOperator(in string) (configs.Operator, error) {
+ if op, ok := operators[in]; ok == true {
+ return op, nil
+ }
+ return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
+}
+
+// ConvertStringToAction converts a string into a Seccomp rule match action.
+// Actions use the names they are assigned in Libseccomp's header, though some
+// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
+// return errors.
+// Attempting to convert a string that is not a valid action results in an
+// error.
+func ConvertStringToAction(in string) (configs.Action, error) {
+ if act, ok := actions[in]; ok == true {
+ return act, nil
+ }
+ return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
+}
+
+// ConvertStringToArch converts a string into a Seccomp comparison arch.
+func ConvertStringToArch(in string) (string, error) {
+ if arch, ok := archs[in]; ok == true {
+ return arch, nil
+ }
+ return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
+}
--- /dev/null
+Name: cat
+State: R (running)
+Tgid: 19383
+Ngid: 0
+Pid: 19383
+PPid: 19275
+TracerPid: 0
+Uid: 1000 1000 1000 1000
+Gid: 1000 1000 1000 1000
+FDSize: 256
+Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001
+NStgid: 19383
+NSpid: 19383
+NSpgid: 19383
+NSsid: 19275
+VmPeak: 5944 kB
+VmSize: 5944 kB
+VmLck: 0 kB
+VmPin: 0 kB
+VmHWM: 744 kB
+VmRSS: 744 kB
+VmData: 324 kB
+VmStk: 136 kB
+VmExe: 48 kB
+VmLib: 1776 kB
+VmPTE: 32 kB
+VmPMD: 12 kB
+VmSwap: 0 kB
+Threads: 1
+SigQ: 0/30067
+SigPnd: 0000000000000000
+ShdPnd: 0000000000000000
+SigBlk: 0000000000000000
+SigIgn: 0000000000000080
+SigCgt: 0000000000000000
+CapInh: 0000000000000000
+CapPrm: 0000000000000000
+CapEff: 0000000000000000
+CapBnd: 0000003fffffffff
+CapAmb: 0000000000000000
+Seccomp: 0
+Cpus_allowed: f
+Cpus_allowed_list: 0-3
+Mems_allowed: 00000000,00000001
+Mems_allowed_list: 0
+voluntary_ctxt_switches: 0
+nonvoluntary_ctxt_switches: 1
--- /dev/null
+// +build linux,cgo,seccomp
+
+package seccomp
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "strings"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ libseccomp "github.com/seccomp/libseccomp-golang"
+
+ "golang.org/x/sys/unix"
+)
+
// Libseccomp actions used when building filters. actTrace and actErrno are
// configured to report EPERM back to the process.
var (
	actAllow = libseccomp.ActAllow
	actTrap  = libseccomp.ActTrap
	actKill  = libseccomp.ActKill
	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)

const (
	// Linux system calls can have at most 6 arguments
	syscallMaxArguments int = 6
)
+
+// Filters given syscalls in a container, preventing them from being used
+// Started in the container init process, and carried over to all child processes
+// Setns calls, however, require a separate invocation, as they are not children
+// of the init until they join the namespace
+func InitSeccomp(config *configs.Seccomp) error {
+ if config == nil {
+ return fmt.Errorf("cannot initialize Seccomp - nil config passed")
+ }
+
+ defaultAction, err := getAction(config.DefaultAction)
+ if err != nil {
+ return fmt.Errorf("error initializing seccomp - invalid default action")
+ }
+
+ filter, err := libseccomp.NewFilter(defaultAction)
+ if err != nil {
+ return fmt.Errorf("error creating filter: %s", err)
+ }
+
+ // Add extra architectures
+ for _, arch := range config.Architectures {
+ scmpArch, err := libseccomp.GetArchFromString(arch)
+ if err != nil {
+ return fmt.Errorf("error validating Seccomp architecture: %s", err)
+ }
+
+ if err := filter.AddArch(scmpArch); err != nil {
+ return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
+ }
+ }
+
+ // Unset no new privs bit
+ if err := filter.SetNoNewPrivsBit(false); err != nil {
+ return fmt.Errorf("error setting no new privileges: %s", err)
+ }
+
+ // Add a rule for each syscall
+ for _, call := range config.Syscalls {
+ if call == nil {
+ return fmt.Errorf("encountered nil syscall while initializing Seccomp")
+ }
+
+ if err = matchCall(filter, call); err != nil {
+ return err
+ }
+ }
+
+ if err = filter.Load(); err != nil {
+ return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
+ }
+
+ return nil
+}
+
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
	// Try to read from /proc/self/status for kernels > 3.8
	s, err := parseStatusFile("/proc/self/status")
	if err != nil {
		// Status file unavailable: fall back to prctl probing.
		// Check if Seccomp is supported, via CONFIG_SECCOMP.
		if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
			// Make sure the kernel has CONFIG_SECCOMP_FILTER.
			// NOTE(review): the zero filter argument presumably makes this
			// call fail without installing anything; EINVAL here signals
			// missing filter support — confirm against prctl(2).
			if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
				return true
			}
		}
		return false
	}
	// The presence of a "Seccomp" field indicates kernel seccomp support.
	_, ok := s["Seccomp"]
	return ok
}
+
+// Convert Libcontainer Action to Libseccomp ScmpAction
+func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
+ switch act {
+ case configs.Kill:
+ return actKill, nil
+ case configs.Errno:
+ return actErrno, nil
+ case configs.Trap:
+ return actTrap, nil
+ case configs.Allow:
+ return actAllow, nil
+ case configs.Trace:
+ return actTrace, nil
+ default:
+ return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
+ }
+}
+
+// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
+func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
+ switch op {
+ case configs.EqualTo:
+ return libseccomp.CompareEqual, nil
+ case configs.NotEqualTo:
+ return libseccomp.CompareNotEqual, nil
+ case configs.GreaterThan:
+ return libseccomp.CompareGreater, nil
+ case configs.GreaterThanOrEqualTo:
+ return libseccomp.CompareGreaterEqual, nil
+ case configs.LessThan:
+ return libseccomp.CompareLess, nil
+ case configs.LessThanOrEqualTo:
+ return libseccomp.CompareLessOrEqual, nil
+ case configs.MaskEqualTo:
+ return libseccomp.CompareMaskedEqual, nil
+ default:
+ return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
+ }
+}
+
+// Convert Libcontainer Arg to Libseccomp ScmpCondition
+func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
+ cond := libseccomp.ScmpCondition{}
+
+ if arg == nil {
+ return cond, fmt.Errorf("cannot convert nil to syscall condition")
+ }
+
+ op, err := getOperator(arg.Op)
+ if err != nil {
+ return cond, err
+ }
+
+ return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
+}
+
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
	if call == nil || filter == nil {
		return fmt.Errorf("cannot use nil as syscall to block")
	}

	if len(call.Name) == 0 {
		return fmt.Errorf("empty string is not a valid syscall")
	}

	// If we can't resolve the syscall, assume it's not supported on this kernel
	// Ignore it, don't error out
	callNum, err := libseccomp.GetSyscallFromName(call.Name)
	if err != nil {
		return nil
	}

	// Convert the call's action to the libseccomp equivalent
	callAct, err := getAction(call.Action)
	if err != nil {
		return fmt.Errorf("action in seccomp profile is invalid: %s", err)
	}

	// Unconditional match - just add the rule
	if len(call.Args) == 0 {
		if err = filter.AddRule(callNum, callAct); err != nil {
			return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
		}
	} else {
		// If two or more arguments have the same condition,
		// Revert to old behavior, adding each condition as a separate rule
		// argCounts tracks how many conditions target each syscall argument.
		argCounts := make([]uint, syscallMaxArguments)
		conditions := []libseccomp.ScmpCondition{}

		for _, cond := range call.Args {
			newCond, err := getCondition(cond)
			if err != nil {
				return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
			}

			argCounts[cond.Index] += 1

			conditions = append(conditions, newCond)
		}

		// Detect whether any single argument index has more than one condition.
		hasMultipleArgs := false
		for _, count := range argCounts {
			if count > 1 {
				hasMultipleArgs = true
				break
			}
		}

		if hasMultipleArgs {
			// Revert to old behavior
			// Add each condition attached to a separate rule
			for _, cond := range conditions {
				condArr := []libseccomp.ScmpCondition{cond}

				if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
					return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
				}
			}
		} else {
			// No conditions share same argument
			// Use new, proper behavior
			if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
				return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
			}
		}
	}

	return nil
}
+
+func parseStatusFile(path string) (map[string]string, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ s := bufio.NewScanner(f)
+ status := make(map[string]string)
+
+ for s.Scan() {
+ text := s.Text()
+ parts := strings.Split(text, ":")
+
+ if len(parts) <= 1 {
+ continue
+ }
+
+ status[parts[0]] = parts[1]
+ }
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ return status, nil
+}
--- /dev/null
+// +build linux,cgo,seccomp
+
+package seccomp
+
+import "testing"
+
+func TestParseStatusFile(t *testing.T) {
+ s, err := parseStatusFile("fixtures/proc_self_status")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if _, ok := s["Seccomp"]; !ok {
+
+ t.Fatal("expected to find 'Seccomp' in the map but did not.")
+ }
+}
--- /dev/null
+// +build !linux !cgo !seccomp
+
+package seccomp
+
+import (
+ "errors"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
// ErrSeccompNotEnabled is returned when a seccomp config is supplied but this
// build was compiled without seccomp support.
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
+
+// InitSeccomp does nothing because seccomp is not supported.
+func InitSeccomp(config *configs.Seccomp) error {
+ if config != nil {
+ return ErrSeccompNotEnabled
+ }
+ return nil
+}
+
// IsEnabled always returns false in this build, because seccomp support is
// not compiled in (non-Linux, no cgo, or missing the seccomp build tag).
func IsEnabled() bool {
	return false
}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+
+ "github.com/opencontainers/runc/libcontainer/apparmor"
+ "github.com/opencontainers/runc/libcontainer/keys"
+ "github.com/opencontainers/runc/libcontainer/seccomp"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/selinux/go-selinux/label"
+ "github.com/pkg/errors"
+
+ "golang.org/x/sys/unix"
+)
+
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
	pipe          *os.File
	consoleSocket *os.File // passed to setupConsole when config.CreateConsole is set
	config        *initConfig
}
+
+func (l *linuxSetnsInit) getSessionRingName() string {
+ return fmt.Sprintf("_ses.%s", l.config.ContainerId)
+}
+
// Init runs the setns initialization sequence: join a fresh session keyring,
// set up the console, apply no-new-privs, SELinux/AppArmor labels and the
// seccomp filter (before or after finalizeNamespace depending on
// NoNewPrivileges), then execve the target process. The OS thread is locked
// because several of these operations are per-thread.
func (l *linuxSetnsInit) Init() error {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !l.config.Config.NoNewKeyring {
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
			}
		}
	}
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	// Reset the label on any return path after this point.
	defer label.SetProcessLabel("")
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
--- /dev/null
+package specconv
+
+import (
+ "os"
+ "strings"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+)
+
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
	return &specs.Spec{
		Version: specs.Version,
		Root: &specs.Root{
			Path:     "rootfs",
			Readonly: true,
		},
		// An interactive shell with a minimal capability set in all five
		// capability sets.
		Process: &specs.Process{
			Terminal: true,
			User:     specs.User{},
			Args: []string{
				"sh",
			},
			Env: []string{
				"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
				"TERM=xterm",
			},
			Cwd:             "/",
			NoNewPrivileges: true,
			Capabilities: &specs.LinuxCapabilities{
				Bounding: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Permitted: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Inheritable: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Ambient: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
				Effective: []string{
					"CAP_AUDIT_WRITE",
					"CAP_KILL",
					"CAP_NET_BIND_SERVICE",
				},
			},
			Rlimits: []specs.POSIXRlimit{
				{
					Type: "RLIMIT_NOFILE",
					Hard: uint64(1024),
					Soft: uint64(1024),
				},
			},
		},
		Hostname: "runc",
		// The standard pseudo-filesystem mounts for a container.
		Mounts: []specs.Mount{
			{
				Destination: "/proc",
				Type:        "proc",
				Source:      "proc",
				Options:     nil,
			},
			{
				Destination: "/dev",
				Type:        "tmpfs",
				Source:      "tmpfs",
				Options:     []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
			},
			{
				Destination: "/dev/pts",
				Type:        "devpts",
				Source:      "devpts",
				Options:     []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
			},
			{
				Destination: "/dev/shm",
				Type:        "tmpfs",
				Source:      "shm",
				Options:     []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
			},
			{
				Destination: "/dev/mqueue",
				Type:        "mqueue",
				Source:      "mqueue",
				Options:     []string{"nosuid", "noexec", "nodev"},
			},
			{
				Destination: "/sys",
				Type:        "sysfs",
				Source:      "sysfs",
				Options:     []string{"nosuid", "noexec", "nodev", "ro"},
			},
			{
				Destination: "/sys/fs/cgroup",
				Type:        "cgroup",
				Source:      "cgroup",
				Options:     []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
			},
		},
		Linux: &specs.Linux{
			// Sensitive proc/sys entries hidden from the container.
			MaskedPaths: []string{
				"/proc/kcore",
				"/proc/latency_stats",
				"/proc/timer_list",
				"/proc/timer_stats",
				"/proc/sched_debug",
				"/sys/firmware",
				"/proc/scsi",
			},
			ReadonlyPaths: []string{
				"/proc/asound",
				"/proc/bus",
				"/proc/fs",
				"/proc/irq",
				"/proc/sys",
				"/proc/sysrq-trigger",
			},
			// Deny all device access except what the runtime whitelists.
			Resources: &specs.LinuxResources{
				Devices: []specs.LinuxDeviceCgroup{
					{
						Allow:  false,
						Access: "rwm",
					},
				},
			},
			Namespaces: []specs.LinuxNamespace{
				{
					Type: "pid",
				},
				{
					Type: "network",
				},
				{
					Type: "ipc",
				},
				{
					Type: "uts",
				},
				{
					Type: "mount",
				},
			},
		},
	}
}
+
+// ToRootless converts the given spec file into one that should work with
+// rootless containers (euid != 0), by removing incompatible options and adding others that
+// are needed.
+func ToRootless(spec *specs.Spec) {
+ var namespaces []specs.LinuxNamespace
+
+ // Remove networkns from the spec.
+ for _, ns := range spec.Linux.Namespaces {
+ switch ns.Type {
+ case specs.NetworkNamespace, specs.UserNamespace:
+ // Do nothing.
+ default:
+ namespaces = append(namespaces, ns)
+ }
+ }
+ // Add userns to the spec.
+ namespaces = append(namespaces, specs.LinuxNamespace{
+ Type: specs.UserNamespace,
+ })
+ spec.Linux.Namespaces = namespaces
+
+ // Add mappings for the current user.
+ spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
+ HostID: uint32(os.Geteuid()),
+ ContainerID: 0,
+ Size: 1,
+ }}
+ spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
+ HostID: uint32(os.Getegid()),
+ ContainerID: 0,
+ Size: 1,
+ }}
+
+ // Fix up mounts.
+ var mounts []specs.Mount
+ for _, mount := range spec.Mounts {
+ // Ignore all mounts that are under /sys.
+ if strings.HasPrefix(mount.Destination, "/sys") {
+ continue
+ }
+
+ // Remove all gid= and uid= mappings.
+ var options []string
+ for _, option := range mount.Options {
+ if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
+ options = append(options, option)
+ }
+ }
+
+ mount.Options = options
+ mounts = append(mounts, mount)
+ }
+ // Add the sysfs mount as an rbind.
+ mounts = append(mounts, specs.Mount{
+ Source: "/sys",
+ Destination: "/sys",
+ Type: "none",
+ Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
+ })
+ spec.Mounts = mounts
+
+ // Remove cgroup settings.
+ spec.Linux.Resources = nil
+}
--- /dev/null
+// +build linux
+
+// Package specconv implements conversion of specifications to libcontainer
+// configurations
+package specconv
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/seccomp"
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "golang.org/x/sys/unix"
+)
+
// wildcard matches any device major/minor number in a cgroup device rule.
const wildcard = -1

// namespaceMapping translates OCI namespace type names into the
// corresponding libcontainer namespace constants.
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
	specs.PIDNamespace:     configs.NEWPID,
	specs.NetworkNamespace: configs.NEWNET,
	specs.MountNamespace:   configs.NEWNS,
	specs.UserNamespace:    configs.NEWUSER,
	specs.IPCNamespace:     configs.NEWIPC,
	specs.UTSNamespace:     configs.NEWUTS,
	specs.CgroupNamespace:  configs.NEWCGROUP,
}

// mountPropagationMapping translates the OCI rootfsPropagation string into
// mount(2) propagation flags; the empty string means "unspecified" (0).
var mountPropagationMapping = map[string]int{
	"rprivate":    unix.MS_PRIVATE | unix.MS_REC,
	"private":     unix.MS_PRIVATE,
	"rslave":      unix.MS_SLAVE | unix.MS_REC,
	"slave":       unix.MS_SLAVE,
	"rshared":     unix.MS_SHARED | unix.MS_REC,
	"shared":      unix.MS_SHARED,
	"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
	"unbindable":  unix.MS_UNBINDABLE,
	"":            0,
}

// allowedDevices is the default cgroup device whitelist given to every
// container: mknod for any char/block device, plus access to the standard
// pseudo-devices, the console, pty devices and tuntap.
var allowedDevices = []*configs.Device{
	// allow mknod for any device
	{
		Type:        'c',
		Major:       wildcard,
		Minor:       wildcard,
		Permissions: "m",
		Allow:       true,
	},
	{
		Type:        'b',
		Major:       wildcard,
		Minor:       wildcard,
		Permissions: "m",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/null",
		Major:       1,
		Minor:       3,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/random",
		Major:       1,
		Minor:       8,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/full",
		Major:       1,
		Minor:       7,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/tty",
		Major:       5,
		Minor:       0,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/zero",
		Major:       1,
		Minor:       5,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/urandom",
		Major:       1,
		Minor:       9,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Path:        "/dev/console",
		Type:        'c',
		Major:       5,
		Minor:       1,
		Permissions: "rwm",
		Allow:       true,
	},
	// /dev/pts/ - pts namespaces are "coming soon"
	{
		Path:        "",
		Type:        'c',
		Major:       136,
		Minor:       wildcard,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Path:        "",
		Type:        'c',
		Major:       5,
		Minor:       2,
		Permissions: "rwm",
		Allow:       true,
	},
	// tuntap
	{
		Path:        "",
		Type:        'c',
		Major:       10,
		Minor:       200,
		Permissions: "rwm",
		Allow:       true,
	},
}

// CreateOpts holds the inputs to CreateLibcontainerConfig.
type CreateOpts struct {
	CgroupName       string // name used for the container's cgroup (or systemd scope name)
	UseSystemdCgroup bool   // interpret CgroupsPath as systemd's "slice:prefix:name" form
	NoPivotRoot      bool   // do not use pivot_root (checked against [r]private propagation)
	NoNewKeyring     bool   // do not create a new session keyring for the container
	Spec             *specs.Spec
	RootlessEUID     bool // running with a non-zero effective uid
	RootlessCgroups  bool // cgroup changes are expected to fail and are ignored
}
+
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
	// runc's cwd will always be the bundle path
	rcwd, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	cwd, err := filepath.Abs(rcwd)
	if err != nil {
		return nil, err
	}
	spec := opts.Spec
	if spec.Root == nil {
		return nil, fmt.Errorf("Root must be specified")
	}
	// A relative rootfs path is resolved against the bundle directory.
	rootfsPath := spec.Root.Path
	if !filepath.IsAbs(rootfsPath) {
		rootfsPath = filepath.Join(cwd, rootfsPath)
	}
	// Annotations are carried over as "key=value" labels; the bundle path
	// is always appended as an extra label.
	labels := []string{}
	for k, v := range spec.Annotations {
		labels = append(labels, fmt.Sprintf("%s=%s", k, v))
	}
	config := &configs.Config{
		Rootfs:          rootfsPath,
		NoPivotRoot:     opts.NoPivotRoot,
		Readonlyfs:      spec.Root.Readonly,
		Hostname:        spec.Hostname,
		Labels:          append(labels, fmt.Sprintf("bundle=%s", cwd)),
		NoNewKeyring:    opts.NoNewKeyring,
		RootlessEUID:    opts.RootlessEUID,
		RootlessCgroups: opts.RootlessCgroups,
	}

	exists := false // reused below for the propagation-mapping lookup
	for _, m := range spec.Mounts {
		config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
	}
	if err := createDevices(spec, config); err != nil {
		return nil, err
	}
	c, err := createCgroupConfig(opts)
	if err != nil {
		return nil, err
	}
	config.Cgroups = c
	// set linux-specific config
	if spec.Linux != nil {
		if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
			return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
		}
		// A [r]private root propagation is rejected when pivot_root is
		// disabled (see the NoPivotRoot option).
		if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
			return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
		}

		// Translate each namespace, rejecting unknown and duplicate types.
		for _, ns := range spec.Linux.Namespaces {
			t, exists := namespaceMapping[ns.Type]
			if !exists {
				return nil, fmt.Errorf("namespace %q does not exist", ns)
			}
			if config.Namespaces.Contains(t) {
				return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
			}
			config.Namespaces.Add(t, ns.Path)
		}
		// A new (pathless) network namespace gets a loopback interface.
		if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
			config.Networks = []*configs.Network{
				{
					Type: "loopback",
				},
			}
		}
		if config.Namespaces.Contains(configs.NEWUSER) {
			if err := setupUserNamespace(spec, config); err != nil {
				return nil, err
			}
		}
		config.MaskPaths = spec.Linux.MaskedPaths
		config.ReadonlyPaths = spec.Linux.ReadonlyPaths
		config.MountLabel = spec.Linux.MountLabel
		config.Sysctl = spec.Linux.Sysctl
		if spec.Linux.Seccomp != nil {
			seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
			if err != nil {
				return nil, err
			}
			config.Seccomp = seccomp
		}
		// Intel RDT schemata (L3 cache / memory bandwidth), if requested.
		if spec.Linux.IntelRdt != nil {
			config.IntelRdt = &configs.IntelRdt{}
			if spec.Linux.IntelRdt.L3CacheSchema != "" {
				config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
			}
			if spec.Linux.IntelRdt.MemBwSchema != "" {
				config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
			}
		}
	}
	if spec.Process != nil {
		config.OomScoreAdj = spec.Process.OOMScoreAdj
		if spec.Process.SelinuxLabel != "" {
			config.ProcessLabel = spec.Process.SelinuxLabel
		}
		if spec.Process.Capabilities != nil {
			config.Capabilities = &configs.Capabilities{
				Bounding:    spec.Process.Capabilities.Bounding,
				Effective:   spec.Process.Capabilities.Effective,
				Permitted:   spec.Process.Capabilities.Permitted,
				Inheritable: spec.Process.Capabilities.Inheritable,
				Ambient:     spec.Process.Capabilities.Ambient,
			}
		}
	}
	createHooks(spec, config)
	config.Version = specs.Version
	return config, nil
}
+
+func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
+ flags, pgflags, data, ext := parseMountOptions(m.Options)
+ source := m.Source
+ device := m.Type
+ if flags&unix.MS_BIND != 0 {
+ if device == "" {
+ device = "bind"
+ }
+ if !filepath.IsAbs(source) {
+ source = filepath.Join(cwd, m.Source)
+ }
+ }
+ return &configs.Mount{
+ Device: device,
+ Source: source,
+ Destination: m.Destination,
+ Data: data,
+ Flags: flags,
+ PropagationFlags: pgflags,
+ Extensions: ext,
+ }
+}
+
// createCgroupConfig builds the libcontainer cgroup configuration from the
// spec: the cgroup path/name (plain or systemd "slice:prefix:name" form),
// the default device whitelist, and every resource limit present in
// spec.Linux.Resources.
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
	var (
		myCgroupPath string

		spec             = opts.Spec
		useSystemdCgroup = opts.UseSystemdCgroup
		name             = opts.CgroupName
	)

	c := &configs.Cgroup{
		Resources: &configs.Resources{},
	}

	// Note: for systemd the raw CgroupsPath is kept (it is parsed below);
	// otherwise it is cleaned first.
	if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
		myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
		if useSystemdCgroup {
			myCgroupPath = spec.Linux.CgroupsPath
		}
	}

	if useSystemdCgroup {
		if myCgroupPath == "" {
			// Defaults when no path is given: system.slice/runc-<name>.
			c.Parent = "system.slice"
			c.ScopePrefix = "runc"
			c.Name = name
		} else {
			// Parse the path from expected "slice:prefix:name"
			// for e.g. "system.slice:docker:1234"
			parts := strings.Split(myCgroupPath, ":")
			if len(parts) != 3 {
				return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
			}
			c.Parent = parts[0]
			c.ScopePrefix = parts[1]
			c.Name = parts[2]
		}
	} else {
		if myCgroupPath == "" {
			c.Name = name
		}
		c.Path = myCgroupPath
	}

	// In rootless containers, any attempt to make cgroup changes is likely to fail.
	// libcontainer will validate this but ignores the error.
	c.Resources.AllowedDevices = allowedDevices
	if spec.Linux != nil {
		r := spec.Linux.Resources
		if r == nil {
			return c, nil
		}
		// Device cgroup rules: missing type means "all", missing
		// major/minor means wildcard (-1); access must be non-empty.
		for i, d := range spec.Linux.Resources.Devices {
			var (
				t     = "a"
				major = int64(-1)
				minor = int64(-1)
			)
			if d.Type != "" {
				t = d.Type
			}
			if d.Major != nil {
				major = *d.Major
			}
			if d.Minor != nil {
				minor = *d.Minor
			}
			if d.Access == "" {
				return nil, fmt.Errorf("device access at %d field cannot be empty", i)
			}
			dt, err := stringToCgroupDeviceRune(t)
			if err != nil {
				return nil, err
			}
			dd := &configs.Device{
				Type:        dt,
				Major:       major,
				Minor:       minor,
				Permissions: d.Access,
				Allow:       d.Allow,
			}
			c.Resources.Devices = append(c.Resources.Devices, dd)
		}
		// Memory limits: each field is copied only when set in the spec.
		if r.Memory != nil {
			if r.Memory.Limit != nil {
				c.Resources.Memory = *r.Memory.Limit
			}
			if r.Memory.Reservation != nil {
				c.Resources.MemoryReservation = *r.Memory.Reservation
			}
			if r.Memory.Swap != nil {
				c.Resources.MemorySwap = *r.Memory.Swap
			}
			if r.Memory.Kernel != nil {
				c.Resources.KernelMemory = *r.Memory.Kernel
			}
			if r.Memory.KernelTCP != nil {
				c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
			}
			if r.Memory.Swappiness != nil {
				c.Resources.MemorySwappiness = r.Memory.Swappiness
			}
			if r.Memory.DisableOOMKiller != nil {
				c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
			}
		}
		// CPU shares/quota/period, realtime settings and cpuset strings.
		if r.CPU != nil {
			if r.CPU.Shares != nil {
				c.Resources.CpuShares = *r.CPU.Shares
			}
			if r.CPU.Quota != nil {
				c.Resources.CpuQuota = *r.CPU.Quota
			}
			if r.CPU.Period != nil {
				c.Resources.CpuPeriod = *r.CPU.Period
			}
			if r.CPU.RealtimeRuntime != nil {
				c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
			}
			if r.CPU.RealtimePeriod != nil {
				c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
			}
			if r.CPU.Cpus != "" {
				c.Resources.CpusetCpus = r.CPU.Cpus
			}
			if r.CPU.Mems != "" {
				c.Resources.CpusetMems = r.CPU.Mems
			}
		}
		if r.Pids != nil {
			c.Resources.PidsLimit = r.Pids.Limit
		}
		// Block I/O: weights and per-device bps/IOPS throttles.
		if r.BlockIO != nil {
			if r.BlockIO.Weight != nil {
				c.Resources.BlkioWeight = *r.BlockIO.Weight
			}
			if r.BlockIO.LeafWeight != nil {
				c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
			}
			if r.BlockIO.WeightDevice != nil {
				for _, wd := range r.BlockIO.WeightDevice {
					var weight, leafWeight uint16
					if wd.Weight != nil {
						weight = *wd.Weight
					}
					if wd.LeafWeight != nil {
						leafWeight = *wd.LeafWeight
					}
					weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
					c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
				}
			}
			if r.BlockIO.ThrottleReadBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleReadIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
				}
			}
		}
		for _, l := range r.HugepageLimits {
			c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
				Pagesize: l.Pagesize,
				Limit:    l.Limit,
			})
		}
		// Network classid and per-interface priorities.
		if r.Network != nil {
			if r.Network.ClassID != nil {
				c.Resources.NetClsClassid = *r.Network.ClassID
			}
			for _, m := range r.Network.Priorities {
				c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
					Interface: m.Name,
					Priority:  int64(m.Priority),
				})
			}
		}
	}
	// append the default allowed devices to the end of the list
	c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
	return c, nil
}
+
// stringToCgroupDeviceRune maps an OCI cgroup device type string onto the
// single-rune form libcontainer uses: 'a' (all), 'b' (block) or 'c' (char).
// Any other string is rejected with an error.
func stringToCgroupDeviceRune(s string) (rune, error) {
	if s == "a" || s == "b" || s == "c" {
		return rune(s[0]), nil
	}
	return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
+
// stringToDeviceRune maps an OCI device node type string onto its rune
// form: 'p' (fifo), 'u' (unbuffered char), 'b' (block) or 'c' (char).
// Any other string is rejected with an error.
func stringToDeviceRune(s string) (rune, error) {
	switch s {
	case "p", "u", "b", "c":
		return rune(s[0]), nil
	}
	return 0, fmt.Errorf("invalid device type %q", s)
}
+
+func createDevices(spec *specs.Spec, config *configs.Config) error {
+ // add whitelisted devices
+ config.Devices = []*configs.Device{
+ {
+ Type: 'c',
+ Path: "/dev/null",
+ Major: 1,
+ Minor: 3,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ {
+ Type: 'c',
+ Path: "/dev/random",
+ Major: 1,
+ Minor: 8,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ {
+ Type: 'c',
+ Path: "/dev/full",
+ Major: 1,
+ Minor: 7,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ {
+ Type: 'c',
+ Path: "/dev/tty",
+ Major: 5,
+ Minor: 0,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ {
+ Type: 'c',
+ Path: "/dev/zero",
+ Major: 1,
+ Minor: 5,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ {
+ Type: 'c',
+ Path: "/dev/urandom",
+ Major: 1,
+ Minor: 9,
+ FileMode: 0666,
+ Uid: 0,
+ Gid: 0,
+ },
+ }
+ // merge in additional devices from the spec
+ if spec.Linux != nil {
+ for _, d := range spec.Linux.Devices {
+ var uid, gid uint32
+ var filemode os.FileMode = 0666
+
+ if d.UID != nil {
+ uid = *d.UID
+ }
+ if d.GID != nil {
+ gid = *d.GID
+ }
+ dt, err := stringToDeviceRune(d.Type)
+ if err != nil {
+ return err
+ }
+ if d.FileMode != nil {
+ filemode = *d.FileMode
+ }
+ device := &configs.Device{
+ Type: dt,
+ Path: d.Path,
+ Major: d.Major,
+ Minor: d.Minor,
+ FileMode: filemode,
+ Uid: uid,
+ Gid: gid,
+ }
+ config.Devices = append(config.Devices, device)
+ }
+ }
+ return nil
+}
+
+func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
+ create := func(m specs.LinuxIDMapping) configs.IDMap {
+ return configs.IDMap{
+ HostID: int(m.HostID),
+ ContainerID: int(m.ContainerID),
+ Size: int(m.Size),
+ }
+ }
+ if spec.Linux != nil {
+ for _, m := range spec.Linux.UIDMappings {
+ config.UidMappings = append(config.UidMappings, create(m))
+ }
+ for _, m := range spec.Linux.GIDMappings {
+ config.GidMappings = append(config.GidMappings, create(m))
+ }
+ }
+ rootUID, err := config.HostRootUID()
+ if err != nil {
+ return err
+ }
+ rootGID, err := config.HostRootGID()
+ if err != nil {
+ return err
+ }
+ for _, node := range config.Devices {
+ node.Uid = uint32(rootUID)
+ node.Gid = uint32(rootGID)
+ }
+ return nil
+}
+
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string, int) {
	var (
		flag     int      // accumulated MS_* mount flags
		pgflag   []int    // propagation flags, in option order
		data     []string // unrecognised options, joined as fs-specific data
		extFlags int      // libcontainer extension flags (EXT_*)
	)
	// Each entry maps an option name to its MS_* flag; clear==true means
	// the option removes the flag (e.g. "rw" clears MS_RDONLY).
	flags := map[string]struct {
		clear bool
		flag  int
	}{
		"acl":           {false, unix.MS_POSIXACL},
		"async":         {true, unix.MS_SYNCHRONOUS},
		"atime":         {true, unix.MS_NOATIME},
		"bind":          {false, unix.MS_BIND},
		"defaults":      {false, 0},
		"dev":           {true, unix.MS_NODEV},
		"diratime":      {true, unix.MS_NODIRATIME},
		"dirsync":       {false, unix.MS_DIRSYNC},
		"exec":          {true, unix.MS_NOEXEC},
		"iversion":      {false, unix.MS_I_VERSION},
		"lazytime":      {false, unix.MS_LAZYTIME},
		"loud":          {true, unix.MS_SILENT},
		"mand":          {false, unix.MS_MANDLOCK},
		"noacl":         {true, unix.MS_POSIXACL},
		"noatime":       {false, unix.MS_NOATIME},
		"nodev":         {false, unix.MS_NODEV},
		"nodiratime":    {false, unix.MS_NODIRATIME},
		"noexec":        {false, unix.MS_NOEXEC},
		"noiversion":    {true, unix.MS_I_VERSION},
		"nolazytime":    {true, unix.MS_LAZYTIME},
		"nomand":        {true, unix.MS_MANDLOCK},
		"norelatime":    {true, unix.MS_RELATIME},
		"nostrictatime": {true, unix.MS_STRICTATIME},
		"nosuid":        {false, unix.MS_NOSUID},
		"rbind":         {false, unix.MS_BIND | unix.MS_REC},
		"relatime":      {false, unix.MS_RELATIME},
		"remount":       {false, unix.MS_REMOUNT},
		"ro":            {false, unix.MS_RDONLY},
		"rw":            {true, unix.MS_RDONLY},
		"silent":        {false, unix.MS_SILENT},
		"strictatime":   {false, unix.MS_STRICTATIME},
		"suid":          {true, unix.MS_NOSUID},
		"sync":          {false, unix.MS_SYNCHRONOUS},
	}
	// Propagation options are collected separately (they need their own
	// mount(2) calls rather than being OR-ed into the main flag word).
	propagationFlags := map[string]int{
		"private":     unix.MS_PRIVATE,
		"shared":      unix.MS_SHARED,
		"slave":       unix.MS_SLAVE,
		"unbindable":  unix.MS_UNBINDABLE,
		"rprivate":    unix.MS_PRIVATE | unix.MS_REC,
		"rshared":     unix.MS_SHARED | unix.MS_REC,
		"rslave":      unix.MS_SLAVE | unix.MS_REC,
		"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
	}
	// libcontainer-specific extensions (not kernel mount flags).
	extensionFlags := map[string]struct {
		clear bool
		flag  int
	}{
		"tmpcopyup": {false, configs.EXT_COPYUP},
	}
	for _, o := range options {
		// If the option does not exist in the flags table or the flag
		// is not supported on the platform,
		// then it is a data value for a specific fs type
		if f, exists := flags[o]; exists && f.flag != 0 {
			if f.clear {
				flag &= ^f.flag
			} else {
				flag |= f.flag
			}
		} else if f, exists := propagationFlags[o]; exists && f != 0 {
			pgflag = append(pgflag, f)
		} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
			if f.clear {
				extFlags &= ^f.flag
			} else {
				extFlags |= f.flag
			}
		} else {
			data = append(data, o)
		}
	}
	return flag, pgflag, strings.Join(data, ","), extFlags
}
+
+func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
+ if config == nil {
+ return nil, nil
+ }
+
+ // No default action specified, no syscalls listed, assume seccomp disabled
+ if config.DefaultAction == "" && len(config.Syscalls) == 0 {
+ return nil, nil
+ }
+
+ newConfig := new(configs.Seccomp)
+ newConfig.Syscalls = []*configs.Syscall{}
+
+ if len(config.Architectures) > 0 {
+ newConfig.Architectures = []string{}
+ for _, arch := range config.Architectures {
+ newArch, err := seccomp.ConvertStringToArch(string(arch))
+ if err != nil {
+ return nil, err
+ }
+ newConfig.Architectures = append(newConfig.Architectures, newArch)
+ }
+ }
+
+ // Convert default action from string representation
+ newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
+ if err != nil {
+ return nil, err
+ }
+ newConfig.DefaultAction = newDefaultAction
+
+ // Loop through all syscall blocks and convert them to libcontainer format
+ for _, call := range config.Syscalls {
+ newAction, err := seccomp.ConvertStringToAction(string(call.Action))
+ if err != nil {
+ return nil, err
+ }
+
+ for _, name := range call.Names {
+ newCall := configs.Syscall{
+ Name: name,
+ Action: newAction,
+ Args: []*configs.Arg{},
+ }
+ // Loop through all the arguments of the syscall and convert them
+ for _, arg := range call.Args {
+ newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
+ if err != nil {
+ return nil, err
+ }
+
+ newArg := configs.Arg{
+ Index: arg.Index,
+ Value: arg.Value,
+ ValueTwo: arg.ValueTwo,
+ Op: newOp,
+ }
+
+ newCall.Args = append(newCall.Args, &newArg)
+ }
+ newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
+ }
+ }
+
+ return newConfig, nil
+}
+
+func createHooks(rspec *specs.Spec, config *configs.Config) {
+ config.Hooks = &configs.Hooks{}
+ if rspec.Hooks != nil {
+
+ for _, h := range rspec.Hooks.Prestart {
+ cmd := createCommandHook(h)
+ config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
+ }
+ for _, h := range rspec.Hooks.Poststart {
+ cmd := createCommandHook(h)
+ config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
+ }
+ for _, h := range rspec.Hooks.Poststop {
+ cmd := createCommandHook(h)
+ config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
+ }
+ }
+}
+
+func createCommandHook(h specs.Hook) configs.Command {
+ cmd := configs.Command{
+ Path: h.Path,
+ Args: h.Args,
+ Env: h.Env,
+ }
+ if h.Timeout != nil {
+ d := time.Duration(*h.Timeout) * time.Second
+ cmd.Timeout = &d
+ }
+ return cmd
+}
--- /dev/null
+// +build linux
+
+package specconv
+
+import (
+ "os"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/configs/validate"
+ "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestCreateCommandHookTimeout(t *testing.T) {
+ timeout := 3600
+ hook := specs.Hook{
+ Path: "/some/hook/path",
+ Args: []string{"--some", "thing"},
+ Env: []string{"SOME=value"},
+ Timeout: &timeout,
+ }
+ command := createCommandHook(hook)
+ timeoutStr := command.Timeout.String()
+ if timeoutStr != "1h0m0s" {
+ t.Errorf("Expected the Timeout to be 1h0m0s, got: %s", timeoutStr)
+ }
+}
+
// TestCreateHooks checks that every hook listed in the spec is converted
// into a command hook in the matching config.Hooks slice (2 prestart,
// 3 poststart, 4 poststop).
func TestCreateHooks(t *testing.T) {
	rspec := &specs.Spec{
		Hooks: &specs.Hooks{
			Prestart: []specs.Hook{
				{
					Path: "/some/hook/path",
				},
				{
					Path: "/some/hook2/path",
					Args: []string{"--some", "thing"},
				},
			},
			Poststart: []specs.Hook{
				{
					Path: "/some/hook/path",
					Args: []string{"--some", "thing"},
					Env:  []string{"SOME=value"},
				},
				{
					Path: "/some/hook2/path",
				},
				{
					Path: "/some/hook3/path",
				},
			},
			Poststop: []specs.Hook{
				{
					Path: "/some/hook/path",
					Args: []string{"--some", "thing"},
					Env:  []string{"SOME=value"},
				},
				{
					Path: "/some/hook2/path",
				},
				{
					Path: "/some/hook3/path",
				},
				{
					Path: "/some/hook4/path",
					Args: []string{"--some", "thing"},
				},
			},
		},
	}
	conf := &configs.Config{}
	createHooks(rspec, conf)

	prestart := conf.Hooks.Prestart

	if len(prestart) != 2 {
		t.Error("Expected 2 Prestart hooks")
	}

	poststart := conf.Hooks.Poststart

	if len(poststart) != 3 {
		t.Error("Expected 3 Poststart hooks")
	}

	poststop := conf.Hooks.Poststop

	if len(poststop) != 4 {
		t.Error("Expected 4 Poststop hooks")
	}

}
// TestSetupSeccomp checks the conversion of an OCI seccomp spec: the
// default action and architectures are translated, and a syscall block
// with several names fans out into one rule per name (1 + 7 = 8 rules).
func TestSetupSeccomp(t *testing.T) {
	conf := &specs.LinuxSeccomp{
		DefaultAction: "SCMP_ACT_ERRNO",
		Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM},
		Syscalls: []specs.LinuxSyscall{
			{
				Names:  []string{"clone"},
				Action: "SCMP_ACT_ALLOW",
				Args: []specs.LinuxSeccompArg{
					{
						Index:    0,
						Value:    2080505856,
						ValueTwo: 0,
						Op:       "SCMP_CMP_MASKED_EQ",
					},
				},
			},
			{
				Names: []string{
					"select",
					"semctl",
					"semget",
					"semop",
					"semtimedop",
					"send",
					"sendfile",
				},
				Action: "SCMP_ACT_ALLOW",
			},
		},
	}
	seccomp, err := SetupSeccomp(conf)

	if err != nil {
		t.Errorf("Couldn't create Seccomp config: %v", err)
	}

	if seccomp.DefaultAction != 2 { // SCMP_ACT_ERRNO
		t.Error("Wrong conversion for DefaultAction")
	}

	if len(seccomp.Architectures) != 2 {
		t.Error("Wrong number of architectures")
	}

	if seccomp.Architectures[0] != "amd64" || seccomp.Architectures[1] != "arm" {
		t.Error("Expected architectures are not found")
	}

	calls := seccomp.Syscalls

	callsLength := len(calls)
	if callsLength != 8 {
		t.Errorf("Expected 8 syscalls, got :%d", callsLength)
	}

	for i, call := range calls {
		if i == 0 {
			// Only the first rule (clone) carries argument filters.
			expectedCloneSyscallArgs := configs.Arg{
				Index:    0,
				Op:       7, // SCMP_CMP_MASKED_EQ
				Value:    2080505856,
				ValueTwo: 0,
			}
			if expectedCloneSyscallArgs != *call.Args[0] {
				t.Errorf("Wrong arguments conversion for the clone syscall under test")
			}
		}
		// 4 is the converted value of SCMP_ACT_ALLOW for every rule.
		if call.Action != 4 {
			t.Error("Wrong conversion for the clone syscall action")
		}

	}

}
+
// TestLinuxCgroupWithMemoryResource checks that every memory-related field
// of the OCI resources block is copied into the cgroup config.
func TestLinuxCgroupWithMemoryResource(t *testing.T) {
	cgroupsPath := "/user/cgroups/path/id"

	spec := &specs.Spec{}
	devices := []specs.LinuxDeviceCgroup{
		{
			Allow:  false,
			Access: "rwm",
		},
	}

	limit := int64(100)
	reservation := int64(50)
	swap := int64(20)
	kernel := int64(40)
	kernelTCP := int64(45)
	swappiness := uint64(1)
	swappinessPtr := &swappiness
	disableOOMKiller := true
	resources := &specs.LinuxResources{
		Devices: devices,
		Memory: &specs.LinuxMemory{
			Limit:            &limit,
			Reservation:      &reservation,
			Swap:             &swap,
			Kernel:           &kernel,
			KernelTCP:        &kernelTCP,
			Swappiness:       swappinessPtr,
			DisableOOMKiller: &disableOOMKiller,
		},
	}
	spec.Linux = &specs.Linux{
		CgroupsPath: cgroupsPath,
		Resources:   resources,
	}

	opts := &CreateOpts{
		CgroupName:       "ContainerID",
		UseSystemdCgroup: false,
		Spec:             spec,
	}

	cgroup, err := createCgroupConfig(opts)
	if err != nil {
		t.Errorf("Couldn't create Cgroup config: %v", err)
	}

	if cgroup.Path != cgroupsPath {
		t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path)
	}
	if cgroup.Resources.Memory != limit {
		t.Errorf("Expected to have %d as memory limit, got %d", limit, cgroup.Resources.Memory)
	}
	if cgroup.Resources.MemoryReservation != reservation {
		t.Errorf("Expected to have %d as memory reservation, got %d", reservation, cgroup.Resources.MemoryReservation)
	}
	if cgroup.Resources.MemorySwap != swap {
		t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap)
	}
	if cgroup.Resources.KernelMemory != kernel {
		t.Errorf("Expected to have %d as Kernel Memory, got %d", kernel, cgroup.Resources.KernelMemory)
	}
	if cgroup.Resources.KernelMemoryTCP != kernelTCP {
		t.Errorf("Expected to have %d as TCP Kernel Memory, got %d", kernelTCP, cgroup.Resources.KernelMemoryTCP)
	}
	// Swappiness is stored as a pointer, so compare the pointers themselves.
	if cgroup.Resources.MemorySwappiness != swappinessPtr {
		t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness)
	}
	if cgroup.Resources.OomKillDisable != disableOOMKiller {
		t.Errorf("The OOMKiller should be enabled")
	}
}
+
+func TestLinuxCgroupSystemd(t *testing.T) {
+ cgroupsPath := "parent:scopeprefix:name"
+
+ spec := &specs.Spec{}
+ spec.Linux = &specs.Linux{
+ CgroupsPath: cgroupsPath,
+ }
+
+ opts := &CreateOpts{
+ UseSystemdCgroup: true,
+ Spec: spec,
+ }
+
+ cgroup, err := createCgroupConfig(opts)
+
+ if err != nil {
+ t.Errorf("Couldn't create Cgroup config: %v", err)
+ }
+
+ expectedParent := "parent"
+ if cgroup.Parent != expectedParent {
+ t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent)
+ }
+
+ expectedScopePrefix := "scopeprefix"
+ if cgroup.ScopePrefix != expectedScopePrefix {
+ t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix)
+ }
+
+ expectedName := "name"
+ if cgroup.Name != expectedName {
+ t.Errorf("Expected to have %s as Name instead of %s", expectedName, cgroup.Name)
+ }
+}
+
+func TestLinuxCgroupSystemdWithEmptyPath(t *testing.T) {
+ cgroupsPath := ""
+
+ spec := &specs.Spec{}
+ spec.Linux = &specs.Linux{
+ CgroupsPath: cgroupsPath,
+ }
+
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: true,
+ Spec: spec,
+ }
+
+ cgroup, err := createCgroupConfig(opts)
+
+ if err != nil {
+ t.Errorf("Couldn't create Cgroup config: %v", err)
+ }
+
+ expectedParent := "system.slice"
+ if cgroup.Parent != expectedParent {
+ t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent)
+ }
+
+ expectedScopePrefix := "runc"
+ if cgroup.ScopePrefix != expectedScopePrefix {
+ t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix)
+ }
+
+ if cgroup.Name != opts.CgroupName {
+ t.Errorf("Expected to have %s as Name instead of %s", opts.CgroupName, cgroup.Name)
+ }
+}
+
+func TestLinuxCgroupSystemdWithInvalidPath(t *testing.T) {
+ cgroupsPath := "/user/cgroups/path/id"
+
+ spec := &specs.Spec{}
+ spec.Linux = &specs.Linux{
+ CgroupsPath: cgroupsPath,
+ }
+
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: true,
+ Spec: spec,
+ }
+
+ _, err := createCgroupConfig(opts)
+ if err == nil {
+ t.Error("Expected to produce an error if not using the correct format for cgroup paths belonging to systemd")
+ }
+}
+func TestLinuxCgroupsPathSpecified(t *testing.T) {
+ cgroupsPath := "/user/cgroups/path/id"
+
+ spec := &specs.Spec{}
+ spec.Linux = &specs.Linux{
+ CgroupsPath: cgroupsPath,
+ }
+
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
+
+ cgroup, err := createCgroupConfig(opts)
+ if err != nil {
+ t.Errorf("Couldn't create Cgroup config: %v", err)
+ }
+
+ if cgroup.Path != cgroupsPath {
+ t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path)
+ }
+}
+
+func TestLinuxCgroupsPathNotSpecified(t *testing.T) {
+ spec := &specs.Spec{}
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
+
+ cgroup, err := createCgroupConfig(opts)
+ if err != nil {
+ t.Errorf("Couldn't create Cgroup config: %v", err)
+ }
+
+ if cgroup.Path != "" {
+ t.Errorf("Wrong cgroupsPath, expected it to be empty string, got '%s'", cgroup.Path)
+ }
+}
+
+func TestSpecconvExampleValidate(t *testing.T) {
+ spec := Example()
+ spec.Root.Path = "/"
+
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ }
+
+ config, err := CreateLibcontainerConfig(opts)
+ if err != nil {
+ t.Errorf("Couldn't create libcontainer config: %v", err)
+ }
+
+ validator := validate.New()
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected specconv to produce valid container config: %v", err)
+ }
+}
+
+func TestDupNamespaces(t *testing.T) {
+ spec := &specs.Spec{
+ Root: &specs.Root{
+ Path: "rootfs",
+ },
+ Linux: &specs.Linux{
+ Namespaces: []specs.LinuxNamespace{
+ {
+ Type: "pid",
+ },
+ {
+ Type: "pid",
+ Path: "/proc/1/ns/pid",
+ },
+ },
+ },
+ }
+
+ _, err := CreateLibcontainerConfig(&CreateOpts{
+ Spec: spec,
+ })
+
+ if !strings.Contains(err.Error(), "malformed spec file: duplicated ns") {
+ t.Errorf("Duplicated namespaces should be forbidden")
+ }
+}
+
+func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
+ if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+ t.Skip("userns is unsupported")
+ }
+
+ spec := Example()
+ spec.Root.Path = "/"
+ ToRootless(spec)
+
+ opts := &CreateOpts{
+ CgroupName: "ContainerID",
+ UseSystemdCgroup: false,
+ Spec: spec,
+ RootlessEUID: true,
+ RootlessCgroups: true,
+ }
+
+ config, err := CreateLibcontainerConfig(opts)
+ if err != nil {
+ t.Errorf("Couldn't create libcontainer config: %v", err)
+ }
+
+ validator := validate.New()
+ if err := validator.Validate(config); err != nil {
+ t.Errorf("Expected specconv to produce valid rootless container config: %v", err)
+ }
+}
--- /dev/null
+package stacktrace
+
+import "runtime"
+
+// Capture captures a stacktrace for the current calling go program
+//
+// skip is the number of frames to skip
+func Capture(userSkip int) Stacktrace {
+ var (
+ skip = userSkip + 1 // add one for our own function
+ frames []Frame
+ prevPc uintptr
+ )
+ for i := skip; ; i++ {
+ pc, file, line, ok := runtime.Caller(i)
+ //detect if caller is repeated to avoid loop, gccgo
+ //currently runs into a loop without this check
+ if !ok || pc == prevPc {
+ break
+ }
+ frames = append(frames, NewFrame(pc, file, line))
+ prevPc = pc
+ }
+ return Stacktrace{
+ Frames: frames,
+ }
+}
--- /dev/null
+package stacktrace
+
+import (
+ "strings"
+ "testing"
+)
+
// captureFunc wraps Capture with skip 0 so the test can assert that the
// first returned frame describes this function (the immediate caller).
func captureFunc() Stacktrace {
	return Capture(0)
}
+
+func TestCaptureTestFunc(t *testing.T) {
+ stack := captureFunc()
+
+ if len(stack.Frames) == 0 {
+ t.Fatal("expected stack frames to be returned")
+ }
+
+ // the first frame is the caller
+ frame := stack.Frames[0]
+ if expected := "captureFunc"; frame.Function != expected {
+ t.Fatalf("expected function %q but received %q", expected, frame.Function)
+ }
+ expected := "/runc/libcontainer/stacktrace"
+ if !strings.HasSuffix(frame.Package, expected) {
+ t.Fatalf("expected package %q but received %q", expected, frame.Package)
+ }
+ if expected := "capture_test.go"; frame.File != expected {
+ t.Fatalf("expected file %q but received %q", expected, frame.File)
+ }
+}
--- /dev/null
+package stacktrace
+
+import (
+ "path/filepath"
+ "runtime"
+ "strings"
+)
+
+// NewFrame returns a new stack frame for the provided information
+// NewFrame builds a Frame for the given program counter, source file, and
+// line number. When the PC cannot be resolved a zero Frame is returned.
+func NewFrame(pc uintptr, file string, line int) Frame {
+	fn := runtime.FuncForPC(pc)
+	if fn == nil {
+		return Frame{}
+	}
+	pkg, fnName := parseFunctionName(fn.Name())
+	return Frame{
+		File:     filepath.Base(file),
+		Line:     line,
+		Package:  pkg,
+		Function: fnName,
+	}
+}
+
+// parseFunctionName splits a fully-qualified symbol name at its last dot,
+// returning the package path and the bare function name. A name without a
+// dot has no package part.
+func parseFunctionName(name string) (string, string) {
+	if i := strings.LastIndex(name, "."); i != -1 {
+		return name[:i], name[i+1:]
+	}
+	return "", name
+}
+
+// Frame contains all the information for a stack frame within a go program
+type Frame struct {
+	// File is the base name of the source file for this frame.
+	File string
+	// Function is the bare function name without the package qualifier.
+	Function string
+	// Package is the package-path portion of the symbol name.
+	Package string
+	// Line is the source line number.
+	Line int
+}
--- /dev/null
+package stacktrace
+
+import "testing"
+
+// TestParsePackageName verifies that a fully-qualified symbol name is split
+// into its package path and bare function name at the last dot.
+func TestParsePackageName(t *testing.T) {
+	var (
+		name             = "github.com/opencontainers/runc/libcontainer/stacktrace.captureFunc"
+		expectedPackage  = "github.com/opencontainers/runc/libcontainer/stacktrace"
+		expectedFunction = "captureFunc"
+	)
+
+	pack, funcName := parseFunctionName(name)
+	if pack != expectedPackage {
+		t.Fatalf("expected package %q but received %q", expectedPackage, pack)
+	}
+
+	if funcName != expectedFunction {
+		t.Fatalf("expected function %q but received %q", expectedFunction, funcName)
+	}
+}
--- /dev/null
+package stacktrace
+
+// Stacktrace is an ordered list of captured stack frames, innermost caller
+// first (as produced by Capture).
+type Stacktrace struct {
+	Frames []Frame
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "runtime"
+ "syscall" //only for Exec
+
+ "github.com/opencontainers/runc/libcontainer/apparmor"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/keys"
+ "github.com/opencontainers/runc/libcontainer/seccomp"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/selinux/go-selinux/label"
+ "github.com/pkg/errors"
+
+ "golang.org/x/sys/unix"
+)
+
+// linuxStandardInit carries the state 'runc init' needs to finish setting up
+// a standard (newly-created, non-setns) container before exec-ing the user
+// process.
+type linuxStandardInit struct {
+	pipe          *os.File // synchronisation pipe to the parent process
+	consoleSocket *os.File // socket over which the console fd is sent
+	parentPid     int      // pid of the parent at the time init started
+	fifoFd        int      // O_PATH fd referring to the exec fifo
+	config        *initConfig
+}
+
+func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
+ var newperms uint32
+
+ if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
+ // With user ns we need 'other' search permissions.
+ newperms = 0x8
+ } else {
+ // Without user ns we need 'UID' search permissions.
+ newperms = 0x80000
+ }
+
+ // Create a unique per session container name that we can join in setns;
+ // However, other containers can also join it.
+ return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
+}
+
+// Init performs the in-container half of container creation: keyring and
+// network setup, rootfs preparation, console wiring, security profiles
+// (AppArmor, SELinux label, seccomp), namespace finalization, and finally
+// the execve of the user process. The statement order here is significant
+// and mirrors the sync protocol with the parent; do not reorder casually.
+func (l *linuxStandardInit) Init() error {
+	// Pin this goroutine to its OS thread: keyring and security-label
+	// operations below act on the calling thread.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+	if !l.config.Config.NoNewKeyring {
+		ringname, keepperms, newperms := l.getSessionRingParams()
+
+		// Do not inherit the parent's session keyring.
+		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
+			// If keyrings aren't supported then it is likely we are on an
+			// older kernel (or inside an LXC container). While we could bail,
+			// the security feature we are using here is best-effort (it only
+			// really provides marginal protection since VFS credentials are
+			// the only significant protection of keyrings).
+			//
+			// TODO(cyphar): Log this so people know what's going on, once we
+			// have proper logging in 'runc init'.
+			if errors.Cause(err) != unix.ENOSYS {
+				return errors.Wrap(err, "join session keyring")
+			}
+		} else {
+			// Make session keyring searcheable. If we've gotten this far we
+			// bail on any error -- we don't want to have a keyring with bad
+			// permissions.
+			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
+				return errors.Wrap(err, "mod keyring permissions")
+			}
+		}
+	}
+
+	// Network and routes must exist before the rootfs is finalized.
+	if err := setupNetwork(l.config); err != nil {
+		return err
+	}
+	if err := setupRoute(l.config.Config); err != nil {
+		return err
+	}
+
+	label.Init()
+	if err := prepareRootfs(l.pipe, l.config); err != nil {
+		return err
+	}
+	// Set up the console. This has to be done *before* we finalize the rootfs,
+	// but *after* we've given the user the chance to set up all of the mounts
+	// they wanted.
+	if l.config.CreateConsole {
+		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
+			return err
+		}
+		if err := system.Setctty(); err != nil {
+			return errors.Wrap(err, "setctty")
+		}
+	}
+
+	// Finish the rootfs setup.
+	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
+		if err := finalizeRootfs(l.config.Config); err != nil {
+			return err
+		}
+	}
+
+	if hostname := l.config.Config.Hostname; hostname != "" {
+		if err := unix.Sethostname([]byte(hostname)); err != nil {
+			return errors.Wrap(err, "sethostname")
+		}
+	}
+	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
+		return errors.Wrap(err, "apply apparmor profile")
+	}
+
+	// Apply sysctls and path restrictions requested by the config.
+	for key, value := range l.config.Config.Sysctl {
+		if err := writeSystemProperty(key, value); err != nil {
+			return errors.Wrapf(err, "write sysctl key %s", key)
+		}
+	}
+	for _, path := range l.config.Config.ReadonlyPaths {
+		if err := readonlyPath(path); err != nil {
+			return errors.Wrapf(err, "readonly path %s", path)
+		}
+	}
+	for _, path := range l.config.Config.MaskPaths {
+		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
+			return errors.Wrapf(err, "mask path %s", path)
+		}
+	}
+	// Save the parent-death signal now; it is cleared by credential changes
+	// in finalizeNamespace and restored afterwards.
+	pdeath, err := system.GetParentDeathSignal()
+	if err != nil {
+		return errors.Wrap(err, "get pdeath signal")
+	}
+	if l.config.NoNewPrivileges {
+		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+			return errors.Wrap(err, "set nonewprivileges")
+		}
+	}
+	// Tell our parent that we're ready to Execv. This must be done before the
+	// Seccomp rules have been applied, because we need to be able to read and
+	// write to a socket.
+	if err := syncParentReady(l.pipe); err != nil {
+		return errors.Wrap(err, "sync ready")
+	}
+	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
+		return errors.Wrap(err, "set process label")
+	}
+	defer label.SetProcessLabel("")
+	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
+	// do this before dropping capabilities; otherwise do it as late as possible
+	// just before execve so as few syscalls take place after it as possible.
+	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
+		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+			return err
+		}
+	}
+	if err := finalizeNamespace(l.config); err != nil {
+		return err
+	}
+	// finalizeNamespace can change user/group which clears the parent death
+	// signal, so we restore it here.
+	if err := pdeath.Restore(); err != nil {
+		return errors.Wrap(err, "restore pdeath signal")
+	}
+	// Compare the parent from the initial start of the init process and make
+	// sure that it did not change. if the parent changes that means it died
+	// and we were reparented to something else so we should just kill ourself
+	// and not cause problems for someone else.
+	if unix.Getppid() != l.parentPid {
+		return unix.Kill(unix.Getpid(), unix.SIGKILL)
+	}
+	// Check for the arg before waiting to make sure it exists and it is
+	// returned as a create time error.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// Close the pipe to signal that we have completed our init.
+	l.pipe.Close()
+	// Wait for the FIFO to be opened on the other side before exec-ing the
+	// user process. We open it through /proc/self/fd/$fd, because the fd that
+	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
+	// re-open an O_PATH fd through /proc.
+	fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return newSystemErrorWithCause(err, "open exec fifo")
+	}
+	// This write blocks until 'runc start' opens the fifo for reading.
+	if _, err := unix.Write(fd, []byte("0")); err != nil {
+		return newSystemErrorWithCause(err, "write 0 exec fifo")
+	}
+	// Close the O_PATH fifofd fd before exec because the kernel resets
+	// dumpable in the wrong order. This has been fixed in newer kernels, but
+	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
+	// N.B. the core issue itself (passing dirfds to the host filesystem) has
+	// since been resolved.
+	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
+	unix.Close(l.fifoFd)
+	// Set seccomp as close to execve as possible, so as few syscalls take
+	// place afterward (reducing the amount of syscalls that users need to
+	// enable in their seccomp profiles).
+	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
+		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+			return newSystemErrorWithCause(err, "init seccomp")
+		}
+	}
+	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
+		return newSystemErrorWithCause(err, "exec user process")
+	}
+	return nil
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+// newStateTransitionError builds a stateTransitionError describing a
+// rejected move from one container state to another.
+func newStateTransitionError(from, to containerState) error {
+	return &stateTransitionError{
+		From: from.status().String(),
+		To:   to.status().String(),
+	}
+}
+
+// stateTransitionError is returned when an invalid state transition happens from one
+// state to another.
+type stateTransitionError struct {
+	From string
+	To   string
+}
+
+func (s *stateTransitionError) Error() string {
+	return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
+}
+
+// containerState is the state-machine interface implemented by every
+// container lifecycle state below.
+type containerState interface {
+	// transition moves to the given state, or fails with a
+	// stateTransitionError when the move is not allowed.
+	transition(containerState) error
+	// destroy releases the container's resources if the state permits it.
+	destroy() error
+	// status reports the externally visible Status of this state.
+	status() Status
+}
+
+// destroy tears down the container's resources and moves it to the stopped
+// state. The first error encountered is what gets returned, but every later
+// cleanup step still runs so a single failure does not leak the rest.
+func destroy(c *linuxContainer) error {
+	// Without a PID namespace there is no init whose death takes the other
+	// processes with it, so they have to be signalled explicitly.
+	if !c.config.Namespaces.Contains(configs.NEWPID) {
+		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
+			logrus.Warn(err)
+		}
+	}
+	err := c.cgroupManager.Destroy()
+	if c.intelRdtManager != nil {
+		// Record this error only if nothing has failed yet.
+		if ierr := c.intelRdtManager.Destroy(); err == nil {
+			err = ierr
+		}
+	}
+	if rerr := os.RemoveAll(c.root); err == nil {
+		err = rerr
+	}
+	c.initProcess = nil
+	if herr := runPoststopHooks(c); err == nil {
+		err = herr
+	}
+	c.state = &stoppedState{c: c}
+	return err
+}
+
+// runPoststopHooks executes the container's poststop hooks in order,
+// stopping at the first hook that fails. It is a no-op when the config
+// declares no hooks.
+func runPoststopHooks(c *linuxContainer) error {
+	if c.config.Hooks == nil {
+		return nil
+	}
+	s, err := c.currentOCIState()
+	if err != nil {
+		return err
+	}
+	for _, hook := range c.config.Hooks.Poststop {
+		if err := hook.Run(s); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// stoppedState represents a container in a stopped/destroyed state.
+type stoppedState struct {
+	c *linuxContainer
+}
+
+func (b *stoppedState) status() Status {
+	return Stopped
+}
+
+// transition allows restarting (running or restored); stopped-to-stopped is
+// a no-op. Anything else is invalid.
+func (b *stoppedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *restoredState:
+		b.c.state = s
+		return nil
+	case *stoppedState:
+		return nil
+	}
+	return newStateTransitionError(b, s)
+}
+
+func (b *stoppedState) destroy() error {
+	return destroy(b.c)
+}
+
+// runningState represents a container that is currently running.
+type runningState struct {
+	c *linuxContainer
+}
+
+func (r *runningState) status() Status {
+	return Running
+}
+
+// transition allows moving to stopped (only once the process has really
+// exited), paused, or running (a no-op).
+func (r *runningState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState:
+		// Re-check the live process status: a container whose init is
+		// still running must not be marked stopped.
+		t, err := r.c.runType()
+		if err != nil {
+			return err
+		}
+		if t == Running {
+			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
+		}
+		r.c.state = s
+		return nil
+	case *pausedState:
+		r.c.state = s
+		return nil
+	case *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+// destroy refuses to tear down a container whose init is still running.
+func (r *runningState) destroy() error {
+	t, err := r.c.runType()
+	if err != nil {
+		return err
+	}
+	if t == Running {
+		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
+	}
+	return destroy(r.c)
+}
+
+// createdState represents a container that has been created but whose user
+// process has not been started yet.
+type createdState struct {
+	c *linuxContainer
+}
+
+func (i *createdState) status() Status {
+	return Created
+}
+
+// transition allows running, pausing, or stopping; created-to-created is a
+// no-op.
+func (i *createdState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *pausedState, *stoppedState:
+		i.c.state = s
+		return nil
+	case *createdState:
+		return nil
+	}
+	return newStateTransitionError(i, s)
+}
+
+// destroy kills the not-yet-exec'd init process and tears the container
+// down. The signal error is ignored here — presumably because init may
+// already have exited; TODO confirm this is intentional.
+func (i *createdState) destroy() error {
+	i.c.initProcess.signal(unix.SIGKILL)
+	return destroy(i.c)
+}
+
+// pausedState represents a container that is currently paused. It cannot be
+// destroyed while paused and must transition back to running first.
+type pausedState struct {
+	c *linuxContainer
+}
+
+func (p *pausedState) status() Status {
+	return Paused
+}
+
+// transition allows running or stopped; paused-to-paused is a no-op.
+func (p *pausedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *stoppedState:
+		p.c.state = s
+		return nil
+	case *pausedState:
+		return nil
+	}
+	return newStateTransitionError(p, s)
+}
+
+// destroy proceeds only when the underlying process is already gone; the
+// cgroup is thawed first so any frozen processes can be reaped during the
+// normal teardown.
+func (p *pausedState) destroy() error {
+	t, err := p.c.runType()
+	if err != nil {
+		return err
+	}
+	if t != Running && t != Created {
+		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
+			return err
+		}
+		return destroy(p.c)
+	}
+	return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
+}
+
+// restoredState is the same as the running state but also has associated
+// checkpoint information that may need to be cleaned up when the container
+// is stopped and destroy is called.
+type restoredState struct {
+	imageDir string
+	c        *linuxContainer
+}
+
+func (r *restoredState) status() Status {
+	return Running
+}
+
+// transition accepts stopped/running but — unlike runningState — does not
+// record the new state on the container.
+func (r *restoredState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState, *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+// destroy tolerates a missing checkpoint directory and then performs the
+// normal teardown.
+func (r *restoredState) destroy() error {
+	if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
+		if !os.IsNotExist(err) {
+			return err
+		}
+	}
+	return destroy(r.c)
+}
+
+// loadedState is used whenever a container is restored, loaded, or setting additional
+// processes inside and it should not be destroyed when it is exiting.
+type loadedState struct {
+	c *linuxContainer
+	s Status
+}
+
+func (n *loadedState) status() Status {
+	return n.s
+}
+
+// transition unconditionally records the new state.
+func (n *loadedState) transition(s containerState) error {
+	n.c.state = s
+	return nil
+}
+
+// destroy re-reads the real container state from disk and delegates to that
+// state's destroy, so a stale loaded snapshot cannot bypass the checks.
+func (n *loadedState) destroy() error {
+	if err := n.c.refreshState(); err != nil {
+		return err
+	}
+	return n.c.state.destroy()
+}
--- /dev/null
+// +build linux
+
+package libcontainer
+
+import (
+ "reflect"
+ "testing"
+)
+
+// states maps one instance of every containerState implementation to the
+// Status it is expected to report.
+var states = map[containerState]Status{
+	&createdState{}:          Created,
+	&runningState{}:          Running,
+	&restoredState{}:         Running,
+	&pausedState{}:           Paused,
+	&stoppedState{}:          Stopped,
+	&loadedState{s: Running}: Running,
+}
+
+// TestStateStatus checks every state's status() against the table above.
+func TestStateStatus(t *testing.T) {
+	for s, status := range states {
+		if s.status() != status {
+			t.Fatalf("state returned %s but expected %s", s.status(), status)
+		}
+	}
+}
+
+// isStateTransitionError reports whether err is a *stateTransitionError.
+func isStateTransitionError(err error) bool {
+	_, ok := err.(*stateTransitionError)
+	return ok
+}
+
+// testTransitions asserts that initialState accepts exactly the transitions
+// listed in valid and rejects every other known state with a
+// stateTransitionError.
+func testTransitions(t *testing.T, initialState containerState, valid []containerState) {
+	validMap := map[reflect.Type]interface{}{}
+	for _, validState := range valid {
+		validMap[reflect.TypeOf(validState)] = nil
+		t.Run(validState.status().String(), func(t *testing.T) {
+			if err := initialState.transition(validState); err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+	// Every state not listed as valid must be rejected.
+	for state := range states {
+		if _, ok := validMap[reflect.TypeOf(state)]; ok {
+			continue
+		}
+		t.Run(state.status().String(), func(t *testing.T) {
+			err := initialState.transition(state)
+			if err == nil {
+				t.Fatal("transition should fail")
+			}
+			if !isStateTransitionError(err) {
+				t.Fatal("expected stateTransitionError")
+			}
+		})
+	}
+}
+
+// TestStoppedStateTransition: stopped may stay stopped or move to
+// running/restored.
+func TestStoppedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&stoppedState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&runningState{},
+			&restoredState{},
+		},
+	)
+}
+
+// TestPausedStateTransition: paused may stay paused or move to
+// running/stopped.
+func TestPausedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&pausedState{c: &linuxContainer{}},
+		[]containerState{
+			&pausedState{},
+			&runningState{},
+			&stoppedState{},
+		},
+	)
+}
+
+// TestRestoredStateTransition: restored may move to stopped/running.
+func TestRestoredStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&restoredState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&runningState{},
+		},
+	)
+}
+
+// TestRunningStateTransition: running may stay running or move to
+// stopped/paused.
+func TestRunningStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&runningState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&pausedState{},
+			&runningState{},
+		},
+	)
+}
+
+// TestCreatedStateTransition: created may stay created or move to
+// stopped/paused/running.
+func TestCreatedStateTransition(t *testing.T) {
+	testTransitions(
+		t,
+		&createdState{c: &linuxContainer{}},
+		[]containerState{
+			&stoppedState{},
+			&pausedState{},
+			&runningState{},
+			&createdState{},
+		},
+	)
+}
--- /dev/null
+package libcontainer
+
+// NetworkInterface holds per-interface traffic counters for one of a
+// container's network interfaces.
+type NetworkInterface struct {
+	// Name is the name of the network interface.
+	Name string
+
+	// Receive-side counters.
+	RxBytes   uint64
+	RxPackets uint64
+	RxErrors  uint64
+	RxDropped uint64
+	// Transmit-side counters.
+	TxBytes   uint64
+	TxPackets uint64
+	TxErrors  uint64
+	TxDropped uint64
+}
--- /dev/null
+package libcontainer
+
+import "github.com/opencontainers/runc/libcontainer/cgroups"
+import "github.com/opencontainers/runc/libcontainer/intelrdt"
+
+// Stats aggregates the statistics collected for a container: per-interface
+// network counters, cgroup accounting, and Intel RDT accounting.
+type Stats struct {
+	Interfaces    []*NetworkInterface
+	CgroupStats   *cgroups.Stats
+	IntelRdtStats *intelrdt.Stats
+}
--- /dev/null
+package libcontainer
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+
+ "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// syncType identifies one message of the parent/child setup protocol.
+type syncType string
+
+// Constants that are used for synchronisation between the parent and child
+// during container setup. They come in pairs (with procError being a generic
+// response which is followed by a &genericError).
+//
+// [ child ] <-> [ parent ]
+//
+// procHooks --> [run hooks]
+// <-- procResume
+//
+// procConsole -->
+// <-- procConsoleReq
+// [send(fd)] --> [recv(fd)]
+// <-- procConsoleAck
+//
+// procReady --> [final setup]
+// <-- procRun
+const (
+	procError  syncType = "procError"
+	procReady  syncType = "procReady"
+	procRun    syncType = "procRun"
+	procHooks  syncType = "procHooks"
+	procResume syncType = "procResume"
+)
+
+// syncT is the JSON payload exchanged over the synchronisation pipe.
+type syncT struct {
+	Type syncType `json:"type"`
+}
+
+// writeSync is used to write to a synchronisation pipe. An error is returned
+// if there was a problem writing the payload.
+func writeSync(pipe io.Writer, sync syncType) error {
+	return utils.WriteJSON(pipe, syncT{sync})
+}
+
+// readSync is used to read from a synchronisation pipe. An error is returned
+// if we got a genericError, the pipe was closed, or we got an unexpected flag.
+func readSync(pipe io.Reader, expected syncType) error {
+	var procSync syncT
+	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+		if err == io.EOF {
+			return fmt.Errorf("parent closed synchronisation channel")
+		}
+		// Previously the checks below were nested inside this branch, so a
+		// non-EOF decode error was silently swallowed (procSync is still the
+		// zero value here) and a *successful* decode was never validated.
+		return fmt.Errorf("failed reading sync message from parent: %v", err)
+	}
+
+	if procSync.Type == procError {
+		// The peer reported a failure; the error payload follows.
+		var ierr genericError
+
+		if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
+			return fmt.Errorf("failed reading error from parent: %v", err)
+		}
+
+		return &ierr
+	}
+
+	if procSync.Type != expected {
+		return fmt.Errorf("invalid synchronisation flag from parent")
+	}
+	return nil
+}
+
+// parseSync runs the given callback function on each syncT received from the
+// child. It will return once io.EOF is returned from the given pipe.
+func parseSync(pipe io.Reader, fn func(*syncT) error) error {
+	dec := json.NewDecoder(pipe)
+	for {
+		var sync syncT
+		if err := dec.Decode(&sync); err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+
+		// We handle this case outside fn for cleanliness reasons.
+		var ierr *genericError
+		if sync.Type == procError {
+			// A procError message is always followed by the error payload;
+			// its absence is a protocol violation by the child.
+			if err := dec.Decode(&ierr); err != nil && err != io.EOF {
+				return newSystemErrorWithCause(err, "decoding proc error from init")
+			}
+			if ierr != nil {
+				return ierr
+			}
+			// Programmer error.
+			panic("No error following JSON procError payload.")
+		}
+
+		if err := fn(&sync); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- /dev/null
+// +build linux
+
+package system
+
+import (
+ "os"
+ "os/exec"
+ "syscall" // only for exec
+ "unsafe"
+
+ "github.com/opencontainers/runc/libcontainer/user"
+ "golang.org/x/sys/unix"
+)
+
+// If arg2 is nonzero, set the "child subreaper" attribute of the
+// calling process; if arg2 is zero, unset the attribute. When a
+// process is marked as a child subreaper, all of the children
+// that it creates, and their descendants, will be marked as
+// having a subreaper. In effect, a subreaper fulfills the role
+// of init(1) for its descendant processes. Upon termination of
+// a process that is orphaned (i.e., its immediate parent has
+// already terminated) and marked as having a subreaper, the
+// nearest still living ancestor subreaper will receive a SIGCHLD
+// signal and be able to wait(2) on the process to discover its
+// termination status.
+const PR_SET_CHILD_SUBREAPER = 36
+
+// ParentDeathSignal is the signal (prctl PR_SET_PDEATHSIG) delivered to this
+// process when its parent dies; the zero value means none is configured.
+type ParentDeathSignal int
+
+// Restore re-applies the saved parent-death signal if it differs from the
+// one currently in effect. A zero value is a no-op.
+func (p ParentDeathSignal) Restore() error {
+	if p == 0 {
+		return nil
+	}
+	current, err := GetParentDeathSignal()
+	if err != nil {
+		return err
+	}
+	if p == current {
+		return nil
+	}
+	return p.Set()
+}
+
+// Set installs p as the calling thread's parent-death signal.
+func (p ParentDeathSignal) Set() error {
+	return SetParentDeathSignal(uintptr(p))
+}
+
+// Execv resolves cmd through the PATH and replaces the current process image
+// with it; it only returns on failure.
+func Execv(cmd string, args []string, env []string) error {
+	name, err := exec.LookPath(cmd)
+	if err != nil {
+		return err
+	}
+
+	return syscall.Exec(name, args, env)
+}
+
+// Prlimit applies the given resource limit to pid via the prlimit64 syscall.
+// The same struct is passed as both new and old limit, so the previous value
+// is written back into limit.
+func Prlimit(pid, resource int, limit unix.Rlimit) error {
+	_, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
+	if err != 0 {
+		return err
+	}
+	return nil
+}
+
+// SetParentDeathSignal sets the signal the kernel delivers to this thread
+// when its parent terminates.
+func SetParentDeathSignal(sig uintptr) error {
+	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// GetParentDeathSignal returns the currently configured parent-death signal.
+func GetParentDeathSignal() (ParentDeathSignal, error) {
+	var sig int
+	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+	return ParentDeathSignal(sig), nil
+}
+
+// SetKeepCaps tells the kernel to retain the thread's permitted capability
+// set across a change of UID (PR_SET_KEEPCAPS = 1).
+func SetKeepCaps() error {
+	return unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0)
+}
+
+// ClearKeepCaps reverts SetKeepCaps, so capabilities are dropped again when
+// the thread changes UID (PR_SET_KEEPCAPS = 0).
+func ClearKeepCaps() error {
+	return unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0)
+}
+
+// Setctty makes the process's stdin (fd 0) its controlling terminal via the
+// TIOCSCTTY ioctl.
+func Setctty() error {
+	return unix.IoctlSetInt(0, unix.TIOCSCTTY, 0)
+}
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func RunningInUserNS() bool {
+	uidmap, err := user.CurrentProcessUIDMap()
+	if err != nil {
+		// This kernel-provided file only exists if user namespaces are supported
+		return false
+	}
+	return UIDMapInUserNS(uidmap)
+}
+
+// UIDMapInUserNS reports whether the given UID map indicates that we are
+// inside a user namespace. The initial namespace is assumed only when there
+// is exactly one full-range mapping: "0 0 4294967295". Anything else —
+// including an empty map, the state right after a namespace is created — is
+// treated as being inside a namespace.
+func UIDMapInUserNS(uidmap []user.IDMap) bool {
+	if len(uidmap) != 1 {
+		return true
+	}
+	m := uidmap[0]
+	return m.ID != 0 || m.ParentID != 0 || m.Count != 4294967295
+}
+
+// GetParentNSeuid returns the euid within the parent user namespace
+func GetParentNSeuid() int64 {
+	euid := int64(os.Geteuid())
+	uidmap, err := user.CurrentProcessUIDMap()
+	if err != nil {
+		// This kernel-provided file only exists if user namespaces are supported
+		return euid
+	}
+	// Translate our euid through the mapping entry that contains it.
+	for _, um := range uidmap {
+		if um.ID <= euid && euid <= um.ID+um.Count-1 {
+			return um.ParentID + euid - um.ID
+		}
+	}
+	// No entry maps our euid; fall back to the in-namespace value.
+	return euid
+}
+
+// SetSubreaper sets the value i as the subreaper setting for the calling process
+// (see the PR_SET_CHILD_SUBREAPER documentation above).
+func SetSubreaper(i int) error {
+	return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
+}
+
+// GetSubreaper returns the subreaper setting for the calling process
+func GetSubreaper() (int, error) {
+	var i uintptr
+
+	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+
+	return int(i), nil
+}
--- /dev/null
+// +build linux
+
+package system
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/user"
+)
+
+// TestUIDMapInUserNS feeds several /proc/self/uid_map-style inputs through
+// the parser and checks the namespace detection result for each.
+func TestUIDMapInUserNS(t *testing.T) {
+	cases := []struct {
+		s        string
+		expected bool
+	}{
+		{
+			// Full-range identity mapping: the initial namespace.
+			s:        "         0          0 4294967295\n",
+			expected: false,
+		},
+		{
+			s:        "         0          0          1\n",
+			expected: true,
+		},
+		{
+			s:        "         0       1001          1\n         1     231072      65536\n",
+			expected: true,
+		},
+		{
+			// file exist but empty (the initial state when userns is created. see man 7 user_namespaces)
+			s:        "",
+			expected: true,
+		},
+	}
+	for _, c := range cases {
+		uidmap, err := user.ParseIDMap(strings.NewReader(c.s))
+		if err != nil {
+			t.Fatal(err)
+		}
+		actual := UIDMapInUserNS(uidmap)
+		if c.expected != actual {
+			t.Fatalf("expected %v, got %v for %q", c.expected, actual, c.s)
+		}
+	}
+}
--- /dev/null
+package system
+
+import (
+ "fmt"
+ "io/ioutil"
+ "path/filepath"
+ "strconv"
+ "strings"
+)
+
+// State is the single-character process state from /proc/[pid]/stat.
+type State rune
+
+const ( // Only values for Linux 3.14 and later are listed here
+	Dead        State = 'X'
+	DiskSleep   State = 'D'
+	Running     State = 'R'
+	Sleeping    State = 'S'
+	Stopped     State = 'T'
+	TracingStop State = 't'
+	Zombie      State = 'Z'
+)
+
+// String returns the human-readable name proc(5) documents for the state in
+// /proc/[pid]/status's "State" field, or "unknown (<c>)" for anything else.
+func (s State) String() string {
+	switch s {
+	case Running:
+		return "running"
+	case Sleeping:
+		return "sleeping"
+	case DiskSleep:
+		return "disk sleep"
+	case Stopped:
+		return "stopped"
+	case TracingStop:
+		return "tracing stop"
+	case Zombie:
+		return "zombie"
+	case Dead:
+		return "dead"
+	default:
+		return fmt.Sprintf("unknown (%c)", s)
+	}
+}
+
+// Stat_t represents the information from /proc/[pid]/stat, as
+// described in proc(5) with names based on the /proc/[pid]/status
+// fields.
+type Stat_t struct {
+	// PID is the process ID.
+	PID uint
+
+	// Name is the command run by the process.
+	Name string
+
+	// State is the state of the process.
+	State State
+
+	// StartTime is the number of clock ticks after system boot (since
+	// Linux 2.6).
+	StartTime uint64
+}
+
+// Stat returns a Stat_t instance for the specified process.
+// It reads and parses /proc/<pid>/stat.
+func Stat(pid int) (stat Stat_t, err error) {
+	bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
+	if err != nil {
+		return stat, err
+	}
+	return parseStat(string(bytes))
+}
+
+// GetProcessStartTime returns the StartTime field of /proc/<pid>/stat as a
+// decimal string.
+//
+// Deprecated: use Stat(pid) and Stat_t.StartTime instead.
+func GetProcessStartTime(pid int) (string, error) {
+	stat, err := Stat(pid)
+	if err != nil {
+		return "", err
+	}
+	return strconv.FormatUint(stat.StartTime, 10), nil
+}
+
+// parseStat extracts the PID, Name, State, and StartTime fields from the raw
+// contents of a /proc/[pid]/stat file. Scan errors on the State/StartTime
+// fields are intentionally not checked; those fields are then left at their
+// zero values — TODO confirm this best-effort behaviour is desired.
+func parseStat(data string) (stat Stat_t, err error) {
+	// From proc(5), field 2 could contain space and is inside `(` and `)`.
+	// The following is an example:
+	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+	i := strings.LastIndex(data, ")")
+	if i <= 2 || i >= len(data)-1 {
+		return stat, fmt.Errorf("invalid stat data: %q", data)
+	}
+
+	parts := strings.SplitN(data[:i], "(", 2)
+	if len(parts) != 2 {
+		return stat, fmt.Errorf("invalid stat data: %q", data)
+	}
+
+	stat.Name = parts[1]
+	_, err = fmt.Sscanf(parts[0], "%d", &stat.PID)
+	if err != nil {
+		return stat, err
+	}
+
+	// parts indexes should be offset by 3 from the field number given
+	// proc(5), because parts is zero-indexed and we've removed fields
+	// one (PID) and two (Name) in the paren-split.
+	parts = strings.Split(data[i+2:], " ")
+	var state int
+	fmt.Sscanf(parts[3-3], "%c", &state)
+	stat.State = State(state)
+	fmt.Sscanf(parts[22-3], "%d", &stat.StartTime)
+	return stat, nil
+}
--- /dev/null
+package system
+
+import "testing"
+
+// TestParseStartTime parses three real-world /proc/[pid]/stat lines —
+// including a command name containing a space — and checks the extracted
+// PID, Name, State, and StartTime fields.
+func TestParseStartTime(t *testing.T) {
+	data := map[string]Stat_t{
+		"4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": {
+			PID:       4902,
+			Name:      "gunicorn: maste",
+			State:     'S',
+			StartTime: 9126532,
+		},
+		"9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": {
+			PID:       9534,
+			Name:      "cat",
+			State:     'R',
+			StartTime: 9214966,
+		},
+
+		"24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": {
+			PID:       24767,
+			Name:      "irq/44-mei_me",
+			State:     'S',
+			StartTime: 8722075,
+		},
+	}
+	for line, expected := range data {
+		st, err := parseStat(line)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if st.PID != expected.PID {
+			t.Fatalf("expected PID %q but received %q", expected.PID, st.PID)
+		}
+		if st.State != expected.State {
+			t.Fatalf("expected state %q but received %q", expected.State, st.State)
+		}
+		if st.Name != expected.Name {
+			t.Fatalf("expected name %q but received %q", expected.Name, st.Name)
+		}
+		if st.StartTime != expected.StartTime {
+			t.Fatalf("expected start time %q but received %q", expected.StartTime, st.StartTime)
+		}
+	}
+}
--- /dev/null
+// +build linux
+// +build 386 arm
+
+package system
+
+import (
+ "golang.org/x/sys/unix"
+)
+
+// Setuid sets the uid of the calling thread to the specified uid.
+// SYS_SETUID32 is the 32-bit-uid variant used on 386/arm; the raw syscall
+// affects only the calling thread (no libc-style propagation) — relied on
+// during container setup; confirm against caller expectations.
+func Setuid(uid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// Setgid sets the gid of the calling thread to the specified gid.
+func Setgid(gid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
--- /dev/null
+// +build linux
+// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le s390x
+
+package system
+
+import (
+ "golang.org/x/sys/unix"
+)
+
+// Setuid sets the uid of the calling thread to the specified uid.
+// The raw syscall affects only the calling thread (no libc-style
+// propagation to other threads) — relied on during container setup;
+// confirm against caller expectations.
+func Setuid(uid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
+
+// Setgid sets the gid of the calling thread to the specified gid.
+func Setgid(gid int) (err error) {
+	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0)
+	if e1 != 0 {
+		err = e1
+	}
+	return
+}
--- /dev/null
+// +build cgo,linux
+
+package system
+
+/*
+#include <unistd.h>
+*/
+import "C"
+
+// GetClockTicks returns the number of clock ticks per second reported by
+// sysconf(_SC_CLK_TCK).
+func GetClockTicks() int {
+	return int(C.sysconf(C._SC_CLK_TCK))
+}
--- /dev/null
+// +build !cgo windows
+
+package system
+
+// GetClockTicks returns a hard-coded clock-tick rate on platforms where the
+// cgo sysconf(_SC_CLK_TCK) implementation is unavailable.
+func GetClockTicks() int {
+	// TODO figure out a better alternative for platforms where we're missing cgo
+	//
+	// TODO Windows. This could be implemented using Win32 QueryPerformanceFrequency().
+	// https://msdn.microsoft.com/en-us/library/windows/desktop/ms644905(v=vs.85).aspx
+	//
+	// An example of its usage can be found here.
+	// https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx
+
+	return 100
+}
--- /dev/null
+// +build !linux
+
+package system
+
+import (
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer/user"
+)
+
+// RunningInUserNS is a stub for non-Linux systems
+// Always returns false
+func RunningInUserNS() bool {
+	return false
+}
+
+// UIDMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func UIDMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}
+
+// GetParentNSeuid returns the euid within the parent user namespace
+// Always returns os.Geteuid on non-linux
+func GetParentNSeuid() int {
+	return os.Geteuid()
+}
--- /dev/null
+package system
+
+import "golang.org/x/sys/unix"
+
+// Lgetxattr returns the value of the extended attribute attr on path (not
+// following symlinks), or an error if the attribute is missing (ENODATA),
+// unsupported (ENOTSUP), or cannot be read.
+func Lgetxattr(path string, attr string) ([]byte, error) {
+	var sz int
+	// Start with a 128 length byte array
+	dest := make([]byte, 128)
+	sz, errno := unix.Lgetxattr(path, attr, dest)
+
+	switch {
+	case errno == unix.ENODATA:
+		return nil, errno
+	case errno == unix.ENOTSUP:
+		return nil, errno
+	case errno == unix.ERANGE:
+		// 128 byte array might just not be good enough,
+		// A dummy buffer is used to get the real size
+		// of the xattrs on disk
+		// NOTE(review): the value can grow between the size query and the
+		// second fetch, which would yield ERANGE again — confirm callers
+		// tolerate that.
+		sz, errno = unix.Lgetxattr(path, attr, []byte{})
+		if errno != nil {
+			return nil, errno
+		}
+		dest = make([]byte, sz)
+		sz, errno = unix.Lgetxattr(path, attr, dest)
+		if errno != nil {
+			return nil, errno
+		}
+	case errno != nil:
+		return nil, errno
+	}
+	return dest[:sz], nil
+}
--- /dev/null
+Tianon Gravi <admwiggin@gmail.com> (@tianon)
+Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
--- /dev/null
+package user
+
+import (
+ "errors"
+)
+
var (
	// ErrUnsupported is returned when the current operating system does
	// not provide the required data for user lookups.
	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
	// ErrNoPasswdEntries is returned when no matching entries are found
	// in the passwd-formatted data.
	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
	// ErrNoGroupEntries is returned when no matching entries are found
	// in the group-formatted data.
	ErrNoGroupEntries = errors.New("no matching entries in group file")
)
+
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
	return lookupUser(username)
}
+
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupUid
// returns an error.
func LookupUid(uid int) (User, error) {
	return lookupUid(uid)
}
+
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
	return lookupGroup(groupname)
}
+
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
	return lookupGid(gid)
}
--- /dev/null
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package user
+
+import (
+ "io"
+ "os"
+ "strconv"
+
+ "golang.org/x/sys/unix"
+)
+
// Unix-specific paths to the passwd- and group-formatted files consulted
// by the lookup helpers below.
const (
	unixPasswdPath = "/etc/passwd"
	unixGroupPath  = "/etc/group"
)
+
+func lookupUser(username string) (User, error) {
+ return lookupUserFunc(func(u User) bool {
+ return u.Name == username
+ })
+}
+
+func lookupUid(uid int) (User, error) {
+ return lookupUserFunc(func(u User) bool {
+ return u.Uid == uid
+ })
+}
+
+func lookupUserFunc(filter func(u User) bool) (User, error) {
+ // Get operating system-specific passwd reader-closer.
+ passwd, err := GetPasswd()
+ if err != nil {
+ return User{}, err
+ }
+ defer passwd.Close()
+
+ // Get the users.
+ users, err := ParsePasswdFilter(passwd, filter)
+ if err != nil {
+ return User{}, err
+ }
+
+ // No user entries found.
+ if len(users) == 0 {
+ return User{}, ErrNoPasswdEntries
+ }
+
+ // Assume the first entry is the "correct" one.
+ return users[0], nil
+}
+
+func lookupGroup(groupname string) (Group, error) {
+ return lookupGroupFunc(func(g Group) bool {
+ return g.Name == groupname
+ })
+}
+
+func lookupGid(gid int) (Group, error) {
+ return lookupGroupFunc(func(g Group) bool {
+ return g.Gid == gid
+ })
+}
+
+func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
+ // Get operating system-specific group reader-closer.
+ group, err := GetGroup()
+ if err != nil {
+ return Group{}, err
+ }
+ defer group.Close()
+
+ // Get the users.
+ groups, err := ParseGroupFilter(group, filter)
+ if err != nil {
+ return Group{}, err
+ }
+
+ // No user entries found.
+ if len(groups) == 0 {
+ return Group{}, ErrNoGroupEntries
+ }
+
+ // Assume the first entry is the "correct" one.
+ return groups[0], nil
+}
+
// GetPasswdPath returns the path to the passwd-formatted file.
func GetPasswdPath() (string, error) {
	return unixPasswdPath, nil
}
+
// GetPasswd opens the passwd-formatted file for reading; the caller must
// close the returned reader.
func GetPasswd() (io.ReadCloser, error) {
	return os.Open(unixPasswdPath)
}
+
// GetGroupPath returns the path to the group-formatted file.
func GetGroupPath() (string, error) {
	return unixGroupPath, nil
}
+
// GetGroup opens the group-formatted file for reading; the caller must
// close the returned reader.
func GetGroup() (io.ReadCloser, error) {
	return os.Open(unixGroupPath)
}
+
// CurrentUser looks up the current user by their user id (unix.Getuid) in
// /etc/passwd. If the user cannot be found (or there is no /etc/passwd
// file on the filesystem), then CurrentUser returns an error.
func CurrentUser() (User, error) {
	return LookupUid(unix.Getuid())
}
+
// CurrentGroup looks up the current process's primary group id
// (unix.Getgid) in /etc/group. If the group cannot be found (or there is
// no /etc/group file on the filesystem), then CurrentGroup returns an
// error.
func CurrentGroup() (Group, error) {
	return LookupGid(unix.Getgid())
}
+
+func currentUserSubIDs(fileName string) ([]SubID, error) {
+ u, err := CurrentUser()
+ if err != nil {
+ return nil, err
+ }
+ filter := func(entry SubID) bool {
+ return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
+ }
+ return ParseSubIDFileFilter(fileName, filter)
+}
+
// CurrentUserSubUIDs returns the current user's entries in /etc/subuid.
func CurrentUserSubUIDs() ([]SubID, error) {
	return currentUserSubIDs("/etc/subuid")
}
+
// CurrentUserSubGIDs returns the current user's entries in /etc/subgid.
func CurrentUserSubGIDs() ([]SubID, error) {
	return currentUserSubIDs("/etc/subgid")
}
+
// CurrentProcessUIDMap returns the uid mappings of the current process.
func CurrentProcessUIDMap() ([]IDMap, error) {
	return ParseIDMapFile("/proc/self/uid_map")
}
+
// CurrentProcessGIDMap returns the gid mappings of the current process.
func CurrentProcessGIDMap() ([]IDMap, error) {
	return ParseIDMapFile("/proc/self/gid_map")
}
--- /dev/null
+// +build windows
+
+package user
+
+import (
+ "fmt"
+ "os/user"
+)
+
+func lookupUser(username string) (User, error) {
+ u, err := user.Lookup(username)
+ if err != nil {
+ return User{}, err
+ }
+ return userFromOS(u)
+}
+
+func lookupUid(uid int) (User, error) {
+ u, err := user.LookupId(fmt.Sprintf("%d", uid))
+ if err != nil {
+ return User{}, err
+ }
+ return userFromOS(u)
+}
+
+func lookupGroup(groupname string) (Group, error) {
+ g, err := user.LookupGroup(groupname)
+ if err != nil {
+ return Group{}, err
+ }
+ return groupFromOS(g)
+}
+
+func lookupGid(gid int) (Group, error) {
+ g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
+ if err != nil {
+ return Group{}, err
+ }
+ return groupFromOS(g)
+}
--- /dev/null
+package user
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "os/user"
+ "strconv"
+ "strings"
+)
+
const (
	// minId is the lowest uid/gid this package accepts.
	minId = 0
	// maxId is the highest uid/gid this package accepts.
	maxId = 1<<31 - 1 //for 32-bit systems compatibility
)
+
var (
	// ErrRange is returned when a uid or gid falls outside [minId, maxId].
	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
+
// User represents one entry of passwd-formatted data
// (name:password:UID:GID:GECOS:directory:shell).
type User struct {
	Name  string // login name
	Pass  string // password field
	Uid   int    // numeric user id
	Gid   int    // numeric primary group id
	Gecos string // GECOS (comment) field
	Home  string // home directory
	Shell string // login shell
}
+
+// userFromOS converts an os/user.(*User) to local User
+//
+// (This does not include Pass, Shell or Gecos)
+func userFromOS(u *user.User) (User, error) {
+ newUser := User{
+ Name: u.Username,
+ Home: u.HomeDir,
+ }
+ id, err := strconv.Atoi(u.Uid)
+ if err != nil {
+ return newUser, err
+ }
+ newUser.Uid = id
+
+ id, err = strconv.Atoi(u.Gid)
+ if err != nil {
+ return newUser, err
+ }
+ newUser.Gid = id
+ return newUser, nil
+}
+
// Group represents one entry of group-formatted data
// (group_name:password:GID:user_list).
type Group struct {
	Name string   // group name
	Pass string   // password field
	Gid  int      // numeric group id
	List []string // member login names
}
+
+// groupFromOS converts an os/user.(*Group) to local Group
+//
+// (This does not include Pass, Shell or Gecos)
+func groupFromOS(g *user.Group) (Group, error) {
+ newGroup := Group{
+ Name: g.Name,
+ }
+
+ id, err := strconv.Atoi(g.Gid)
+ if err != nil {
+ return newGroup, err
+ }
+ newGroup.Gid = id
+
+ return newGroup, nil
+}
+
// SubID represents an entry in /etc/sub{u,g}id
// (name:first subordinate id:range length).
type SubID struct {
	Name  string // owner login name or numeric id
	SubID int64  // first id in the subordinate range
	Count int64  // number of ids in the range
}
+
// IDMap represents an entry in /proc/PID/{u,g}id_map
// (id-inside-ns id-outside-ns range length).
type IDMap struct {
	ID       int64 // first id inside the namespace
	ParentID int64 // first id in the parent namespace
	Count    int64 // number of ids mapped
}
+
// parseLine splits a colon-separated record and scans its fields into the
// given destination pointers; see parseParts for the supported types.
func parseLine(line string, v ...interface{}) {
	parseParts(strings.Split(line, ":"), v...)
}
+
// parseParts scans parts into the given destination pointers, scanf()-style.
// Supported destinations are *string, *int, *int64 and *[]string (a
// comma-separated list); any other type panics. Surplus parts or surplus
// destinations are silently ignored, and numeric conversion errors leave
// the destination set to zero.
func parseParts(parts []string, v ...interface{}) {
	for i, dst := range v {
		if i >= len(parts) {
			// Not enough fields for the remaining destinations.
			// Some configuration files like to misbehave.
			return
		}
		p := parts[i]

		// Use the type of the argument to figure out how to parse it, scanf() style.
		// This is legit.
		switch e := dst.(type) {
		case *string:
			*e = p
		case *int:
			// "numbers", with conversion errors ignored because of some misbehaving configuration files.
			*e, _ = strconv.Atoi(p)
		case *int64:
			*e, _ = strconv.ParseInt(p, 10, 64)
		case *[]string:
			// Comma-separated lists; an empty field yields an empty slice.
			if p == "" {
				*e = []string{}
			} else {
				*e = strings.Split(p, ",")
			}
		default:
			// Someone goof'd when writing code using this function. Scream so they can hear us.
			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
		}
	}
}
+
// ParsePasswdFile opens path and parses it as passwd-formatted data.
func ParsePasswdFile(path string) ([]User, error) {
	passwd, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer passwd.Close()
	return ParsePasswd(passwd)
}
+
// ParsePasswd parses passwd-formatted data, returning every entry.
func ParsePasswd(passwd io.Reader) ([]User, error) {
	return ParsePasswdFilter(passwd, nil)
}
+
// ParsePasswdFileFilter opens path and parses it as passwd-formatted data,
// returning only the entries accepted by filter.
func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
	passwd, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer passwd.Close()
	return ParsePasswdFilter(passwd, filter)
}
+
+func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
+ if r == nil {
+ return nil, fmt.Errorf("nil source for passwd-formatted data")
+ }
+
+ var (
+ s = bufio.NewScanner(r)
+ out = []User{}
+ )
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ line := strings.TrimSpace(s.Text())
+ if line == "" {
+ continue
+ }
+
+ // see: man 5 passwd
+ // name:password:UID:GID:GECOS:directory:shell
+ // Name:Pass:Uid:Gid:Gecos:Home:Shell
+ // root:x:0:0:root:/root:/bin/bash
+ // adm:x:3:4:adm:/var/adm:/bin/false
+ p := User{}
+ parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
+
+ if filter == nil || filter(p) {
+ out = append(out, p)
+ }
+ }
+
+ return out, nil
+}
+
// ParseGroupFile opens path and parses it as group-formatted data.
func ParseGroupFile(path string) ([]Group, error) {
	group, err := os.Open(path)
	if err != nil {
		return nil, err
	}

	defer group.Close()
	return ParseGroup(group)
}
+
// ParseGroup parses group-formatted data, returning every entry.
func ParseGroup(group io.Reader) ([]Group, error) {
	return ParseGroupFilter(group, nil)
}
+
// ParseGroupFileFilter opens path and parses it as group-formatted data,
// returning only the entries accepted by filter.
func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
	group, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer group.Close()
	return ParseGroupFilter(group, filter)
}
+
+func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
+ if r == nil {
+ return nil, fmt.Errorf("nil source for group-formatted data")
+ }
+
+ var (
+ s = bufio.NewScanner(r)
+ out = []Group{}
+ )
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ text := s.Text()
+ if text == "" {
+ continue
+ }
+
+ // see: man 5 group
+ // group_name:password:GID:user_list
+ // Name:Pass:Gid:List
+ // root:x:0:root
+ // adm:x:4:root,adm,daemon
+ p := Group{}
+ parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)
+
+ if filter == nil || filter(p) {
+ out = append(out, p)
+ }
+ }
+
+ return out, nil
+}
+
// ExecUser is the resolved identity a process should execute as, produced
// by GetExecUser from a "user[:group]" specification.
type ExecUser struct {
	Uid   int   // resolved user id
	Gid   int   // resolved group id
	Sgids []int // supplementary group ids
	Home  string // home directory
}
+
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
// given file paths and uses that data as the arguments to GetExecUser. If the
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
	var passwd, group io.Reader

	// Open errors are deliberately swallowed: a missing passwd/group file
	// just leaves the corresponding reader nil.
	if passwdFile, err := os.Open(passwdPath); err == nil {
		passwd = passwdFile
		defer passwdFile.Close()
	}

	if groupFile, err := os.Open(groupPath); err == nil {
		group = groupFile
		defer groupFile.Close()
	}

	return GetExecUser(userSpec, defaults, passwd, group)
}
+
// GetExecUser parses a user specification string (using the passwd and group
// readers as sources for /etc/passwd and /etc/group data, respectively). In
// the case of blank fields or missing data from the sources, the values in
// defaults is used.
//
// GetExecUser will return an error if a user or group literal could not be
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
//   - ""
//   - "user"
//   - "uid"
//   - "user:group"
//   - "uid:gid"
//   - "user:gid"
//   - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
	if defaults == nil {
		defaults = new(ExecUser)
	}

	// Copy over defaults.
	user := &ExecUser{
		Uid:   defaults.Uid,
		Gid:   defaults.Gid,
		Sgids: defaults.Sgids,
		Home:  defaults.Home,
	}

	// Sgids slice *cannot* be nil.
	if user.Sgids == nil {
		user.Sgids = []int{}
	}

	// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
	var userArg, groupArg string
	parseLine(userSpec, &userArg, &groupArg)

	// Convert userArg and groupArg to be numeric, so we don't have to execute
	// Atoi *twice* for each iteration over lines.
	uidArg, uidErr := strconv.Atoi(userArg)
	gidArg, gidErr := strconv.Atoi(groupArg)

	// Find the matching user.
	users, err := ParsePasswdFilter(passwd, func(u User) bool {
		if userArg == "" {
			// Default to current state of the user.
			return u.Uid == user.Uid
		}

		if uidErr == nil {
			// If the userArg is numeric, always treat it as a UID.
			return uidArg == u.Uid
		}

		return u.Name == userArg
	})

	// If we can't find the user, we have to bail.
	// (A nil passwd reader is tolerated: ParsePasswdFilter's error is only
	// fatal when a passwd source was actually supplied.)
	if err != nil && passwd != nil {
		if userArg == "" {
			userArg = strconv.Itoa(user.Uid)
		}
		return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
	}

	var matchedUserName string
	if len(users) > 0 {
		// First match wins, even if there's more than one matching entry.
		matchedUserName = users[0].Name
		user.Uid = users[0].Uid
		user.Gid = users[0].Gid
		user.Home = users[0].Home
	} else if userArg != "" {
		// If we can't find a user with the given username, the only other valid
		// option is if it's a numeric username with no associated entry in passwd.

		if uidErr != nil {
			// Not numeric.
			return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
		}
		user.Uid = uidArg

		// Must be inside valid uid range.
		if user.Uid < minId || user.Uid > maxId {
			return nil, ErrRange
		}

		// Okay, so it's numeric. We can just roll with this.
	}

	// On to the groups. If we matched a username, we need to do this because of
	// the supplementary group IDs.
	if groupArg != "" || matchedUserName != "" {
		groups, err := ParseGroupFilter(group, func(g Group) bool {
			// If the group argument isn't explicit, we'll just search for it.
			if groupArg == "" {
				// Check if user is a member of this group.
				for _, u := range g.List {
					if u == matchedUserName {
						return true
					}
				}
				return false
			}

			if gidErr == nil {
				// If the groupArg is numeric, always treat it as a GID.
				return gidArg == g.Gid
			}

			return g.Name == groupArg
		})
		if err != nil && group != nil {
			return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
		}

		// Only start modifying user.Gid if it is in explicit form.
		if groupArg != "" {
			if len(groups) > 0 {
				// First match wins, even if there's more than one matching entry.
				user.Gid = groups[0].Gid
			} else {
				// If we can't find a group with the given name, the only other valid
				// option is if it's a numeric group name with no associated entry in group.

				if gidErr != nil {
					// Not numeric.
					return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
				}
				user.Gid = gidArg

				// Must be inside valid gid range.
				if user.Gid < minId || user.Gid > maxId {
					return nil, ErrRange
				}

				// Okay, so it's numeric. We can just roll with this.
			}
		} else if len(groups) > 0 {
			// Supplementary group ids only make sense if in the implicit form.
			user.Sgids = make([]int, len(groups))
			for i, group := range groups {
				user.Sgids[i] = group.Gid
			}
		}
	}

	return user, nil
}
+
// GetAdditionalGroups looks up a list of groups by name or group id
// against the given /etc/group formatted data. If a group name cannot
// be found, an error will be returned. If a group id cannot be found,
// or the given group data is nil, the id will be returned as-is
// provided it is in the legal range.
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
	var groups = []Group{}
	if group != nil {
		var err error
		// Pre-select only the group entries that match one of the
		// requested names or numeric ids.
		groups, err = ParseGroupFilter(group, func(g Group) bool {
			for _, ag := range additionalGroups {
				if g.Name == ag || strconv.Itoa(g.Gid) == ag {
					return true
				}
			}
			return false
		})
		if err != nil {
			return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err)
		}
	}

	// gidMap deduplicates gids requested both by name and by number.
	gidMap := make(map[int]struct{})
	for _, ag := range additionalGroups {
		var found bool
		for _, g := range groups {
			// if we found a matched group either by name or gid, take the
			// first matched as correct
			if g.Name == ag || strconv.Itoa(g.Gid) == ag {
				if _, ok := gidMap[g.Gid]; !ok {
					gidMap[g.Gid] = struct{}{}
					found = true
					break
				}
			}
		}
		// we asked for a group but didn't find it. let's check to see
		// if we wanted a numeric group
		if !found {
			gid, err := strconv.Atoi(ag)
			if err != nil {
				return nil, fmt.Errorf("Unable to find group %s", ag)
			}
			// Ensure gid is inside gid range.
			if gid < minId || gid > maxId {
				return nil, ErrRange
			}
			gidMap[gid] = struct{}{}
		}
	}
	// Note: map iteration order is random, so the returned gids are in no
	// particular order.
	gids := []int{}
	for gid := range gidMap {
		gids = append(gids, gid)
	}
	return gids, nil
}
+
// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
	var group io.Reader

	// A missing/unreadable group file is tolerated: group stays nil and
	// only numeric ids can then be resolved.
	if groupFile, err := os.Open(groupPath); err == nil {
		group = groupFile
		defer groupFile.Close()
	}
	return GetAdditionalGroups(additionalGroups, group)
}
+
// ParseSubIDFile opens path and parses it as /etc/sub{u,g}id-formatted data.
func ParseSubIDFile(path string) ([]SubID, error) {
	subid, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer subid.Close()
	return ParseSubID(subid)
}
+
// ParseSubID parses subid-formatted data, returning every entry.
func ParseSubID(subid io.Reader) ([]SubID, error) {
	return ParseSubIDFilter(subid, nil)
}
+
// ParseSubIDFileFilter opens path and parses it as subid-formatted data,
// returning only the entries accepted by filter.
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
	subid, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer subid.Close()
	return ParseSubIDFilter(subid, filter)
}
+
+func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
+ if r == nil {
+ return nil, fmt.Errorf("nil source for subid-formatted data")
+ }
+
+ var (
+ s = bufio.NewScanner(r)
+ out = []SubID{}
+ )
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ line := strings.TrimSpace(s.Text())
+ if line == "" {
+ continue
+ }
+
+ // see: man 5 subuid
+ p := SubID{}
+ parseLine(line, &p.Name, &p.SubID, &p.Count)
+
+ if filter == nil || filter(p) {
+ out = append(out, p)
+ }
+ }
+
+ return out, nil
+}
+
// ParseIDMapFile opens path and parses it as id_map-formatted data.
func ParseIDMapFile(path string) ([]IDMap, error) {
	r, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer r.Close()
	return ParseIDMap(r)
}
+
// ParseIDMap parses id_map-formatted data, returning every entry.
func ParseIDMap(r io.Reader) ([]IDMap, error) {
	return ParseIDMapFilter(r, nil)
}
+
// ParseIDMapFileFilter opens path and parses it as id_map-formatted data,
// returning only the entries accepted by filter.
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
	r, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer r.Close()
	return ParseIDMapFilter(r, filter)
}
+
+func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
+ if r == nil {
+ return nil, fmt.Errorf("nil source for idmap-formatted data")
+ }
+
+ var (
+ s = bufio.NewScanner(r)
+ out = []IDMap{}
+ )
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+
+ line := strings.TrimSpace(s.Text())
+ if line == "" {
+ continue
+ }
+
+ // see: man 7 user_namespaces
+ p := IDMap{}
+ parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)
+
+ if filter == nil || filter(p) {
+ out = append(out, p)
+ }
+ }
+
+ return out, nil
+}
--- /dev/null
+package user
+
+import (
+ "io"
+ "reflect"
+ "sort"
+ "strconv"
+ "strings"
+ "testing"
+
+ "github.com/opencontainers/runc/libcontainer/utils"
+)
+
// TestUserParseLine exercises parseLine's scanf-style scanning across
// string, list and int destinations, including short input, excess
// separators, and non-numeric values (which scan as zero).
func TestUserParseLine(t *testing.T) {
	var (
		a, b string
		c    []string
		d    int
	)

	parseLine("", &a, &b)
	if a != "" || b != "" {
		t.Fatalf("a and b should be empty ('%v', '%v')", a, b)
	}

	parseLine("a", &a, &b)
	if a != "a" || b != "" {
		t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b)
	}

	parseLine("bad boys:corny cows", &a, &b)
	if a != "bad boys" || b != "corny cows" {
		t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b)
	}

	parseLine("", &c)
	if len(c) != 0 {
		t.Fatalf("c should be empty (%#v)", c)
	}

	parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c)
	if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" {
		t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c)
	}

	parseLine("::::::::::", &a, &b, &c)
	if a != "" || b != "" || len(c) != 0 {
		t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c)
	}

	parseLine("not a number", &d)
	if d != 0 {
		t.Fatalf("d should be 0 (%v)", d)
	}

	parseLine("b:12:c", &a, &d, &b)
	if a != "b" || b != "c" || d != 12 {
		t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d)
	}
}
+
// TestUserParsePasswd checks that passwd parsing keeps every non-empty
// line (including garbage lines, which parse to mostly-zero entries) and
// fills Uid/Name from well-formed records.
func TestUserParsePasswd(t *testing.T) {
	users, err := ParsePasswdFilter(strings.NewReader(`
root:x:0:0:root:/root:/bin/bash
adm:x:3:4:adm:/var/adm:/bin/false
this is just some garbage data
`), nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if len(users) != 3 {
		t.Fatalf("Expected 3 users, got %v", len(users))
	}
	if users[0].Uid != 0 || users[0].Name != "root" {
		t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name)
	}
	if users[1].Uid != 3 || users[1].Name != "adm" {
		t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name)
	}
}
+
// TestUserParseGroup checks that group parsing keeps every non-empty line
// and fills Gid/Name/List from well-formed records.
func TestUserParseGroup(t *testing.T) {
	groups, err := ParseGroupFilter(strings.NewReader(`
root:x:0:root
adm:x:4:root,adm,daemon
this is just some garbage data
`), nil)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if len(groups) != 3 {
		t.Fatalf("Expected 3 groups, got %v", len(groups))
	}
	if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 {
		t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List))
	}
	if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 {
		t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List))
	}
}
+
// TestValidGetExecUser covers the accepted "user[:group]" spec forms
// (name, uid, name:group, uid:gid, unknown-but-numeric ids, and the empty
// spec falling back to defaults), asserting the fully resolved ExecUser.
func TestValidGetExecUser(t *testing.T) {
	const passwdContent = `
root:x:0:0:root user:/root:/bin/bash
adm:x:42:43:adm:/var/adm:/bin/false
111:x:222:333::/var/garbage
odd:x:111:112::/home/odd:::::
this is just some garbage data
`
	const groupContent = `
root:x:0:root
adm:x:43:
grp:x:1234:root,adm
444:x:555:111
odd:x:444:
this is just some garbage data
`
	defaultExecUser := ExecUser{
		Uid:   8888,
		Gid:   8888,
		Sgids: []int{8888},
		Home:  "/8888",
	}

	tests := []struct {
		ref      string
		expected ExecUser
	}{
		{
			ref: "root",
			expected: ExecUser{
				Uid:   0,
				Gid:   0,
				Sgids: []int{0, 1234},
				Home:  "/root",
			},
		},
		{
			ref: "adm",
			expected: ExecUser{
				Uid:   42,
				Gid:   43,
				Sgids: []int{1234},
				Home:  "/var/adm",
			},
		},
		{
			ref: "root:adm",
			expected: ExecUser{
				Uid:   0,
				Gid:   43,
				Sgids: defaultExecUser.Sgids,
				Home:  "/root",
			},
		},
		{
			ref: "adm:1234",
			expected: ExecUser{
				Uid:   42,
				Gid:   1234,
				Sgids: defaultExecUser.Sgids,
				Home:  "/var/adm",
			},
		},
		{
			ref: "42:1234",
			expected: ExecUser{
				Uid:   42,
				Gid:   1234,
				Sgids: defaultExecUser.Sgids,
				Home:  "/var/adm",
			},
		},
		{
			ref: "1337:1234",
			expected: ExecUser{
				Uid:   1337,
				Gid:   1234,
				Sgids: defaultExecUser.Sgids,
				Home:  defaultExecUser.Home,
			},
		},
		{
			ref: "1337",
			expected: ExecUser{
				Uid:   1337,
				Gid:   defaultExecUser.Gid,
				Sgids: defaultExecUser.Sgids,
				Home:  defaultExecUser.Home,
			},
		},
		{
			ref: "",
			expected: ExecUser{
				Uid:   defaultExecUser.Uid,
				Gid:   defaultExecUser.Gid,
				Sgids: defaultExecUser.Sgids,
				Home:  defaultExecUser.Home,
			},
		},

		// Regression tests for #695.
		{
			ref: "111",
			expected: ExecUser{
				Uid:   111,
				Gid:   112,
				Sgids: defaultExecUser.Sgids,
				Home:  "/home/odd",
			},
		},
		{
			ref: "111:444",
			expected: ExecUser{
				Uid:   111,
				Gid:   444,
				Sgids: defaultExecUser.Sgids,
				Home:  "/home/odd",
			},
		},
	}

	for _, test := range tests {
		passwd := strings.NewReader(passwdContent)
		group := strings.NewReader(groupContent)

		execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group)
		if err != nil {
			t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error())
			t.Fail()
			continue
		}

		if !reflect.DeepEqual(test.expected, *execUser) {
			t.Logf("ref: %v", test.ref)
			t.Logf("got: %#v", execUser)
			t.Logf("expected: %#v", test.expected)
			t.Fail()
			continue
		}
	}
}
+
// TestInvalidGetExecUser checks that unknown user/group names and
// out-of-range numeric ids are rejected by GetExecUser.
func TestInvalidGetExecUser(t *testing.T) {
	const passwdContent = `
root:x:0:0:root user:/root:/bin/bash
adm:x:42:43:adm:/var/adm:/bin/false
-42:x:12:13:broken:/very/broken
this is just some garbage data
`
	const groupContent = `
root:x:0:root
adm:x:43:
grp:x:1234:root,adm
this is just some garbage data
`

	tests := []string{
		// No such user/group.
		"notuser",
		"notuser:notgroup",
		"root:notgroup",
		"notuser:adm",
		"8888:notgroup",
		"notuser:8888",

		// Invalid user/group values.
		"-1:0",
		"0:-3",
		"-5:-2",
		"-42",
		"-43",
	}

	for _, test := range tests {
		passwd := strings.NewReader(passwdContent)
		group := strings.NewReader(groupContent)

		execUser, err := GetExecUser(test, nil, passwd, group)
		if err == nil {
			t.Logf("got unexpected success when parsing '%s': %#v", test, execUser)
			t.Fail()
			continue
		}
	}
}
+
// TestGetExecUserNilSources checks GetExecUser's fallback behaviour when
// the passwd and/or group readers are nil: defaults are kept, and numeric
// specs are accepted without any backing entry.
func TestGetExecUserNilSources(t *testing.T) {
	const passwdContent = `
root:x:0:0:root user:/root:/bin/bash
adm:x:42:43:adm:/var/adm:/bin/false
this is just some garbage data
`
	const groupContent = `
root:x:0:root
adm:x:43:
grp:x:1234:root,adm
this is just some garbage data
`

	defaultExecUser := ExecUser{
		Uid:   8888,
		Gid:   8888,
		Sgids: []int{8888},
		Home:  "/8888",
	}

	tests := []struct {
		ref           string
		passwd, group bool
		expected      ExecUser
	}{
		{
			ref:    "",
			passwd: false,
			group:  false,
			expected: ExecUser{
				Uid:   8888,
				Gid:   8888,
				Sgids: []int{8888},
				Home:  "/8888",
			},
		},
		{
			ref:    "root",
			passwd: true,
			group:  false,
			expected: ExecUser{
				Uid:   0,
				Gid:   0,
				Sgids: []int{8888},
				Home:  "/root",
			},
		},
		{
			ref:    "0",
			passwd: false,
			group:  false,
			expected: ExecUser{
				Uid:   0,
				Gid:   8888,
				Sgids: []int{8888},
				Home:  "/8888",
			},
		},
		{
			ref:    "0:0",
			passwd: false,
			group:  false,
			expected: ExecUser{
				Uid:   0,
				Gid:   0,
				Sgids: []int{8888},
				Home:  "/8888",
			},
		},
	}

	for _, test := range tests {
		var passwd, group io.Reader

		if test.passwd {
			passwd = strings.NewReader(passwdContent)
		}

		if test.group {
			group = strings.NewReader(groupContent)
		}

		execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group)
		if err != nil {
			t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error())
			t.Fail()
			continue
		}

		if !reflect.DeepEqual(test.expected, *execUser) {
			t.Logf("got: %#v", execUser)
			t.Logf("expected: %#v", test.expected)
			t.Fail()
			continue
		}
	}
}
+
// TestGetAdditionalGroups covers name and numeric lookups against group
// data, deduplication of name+numeric requests, unknown names (error),
// unknown numeric ids (passed through), and out-of-range ids (error).
func TestGetAdditionalGroups(t *testing.T) {
	type foo struct {
		groups   []string
		expected []int
		hasError bool
	}

	const groupContent = `
root:x:0:root
adm:x:43:
grp:x:1234:root,adm
adm:x:4343:root,adm-duplicate
this is just some garbage data
`
	tests := []foo{
		{
			// empty group
			groups:   []string{},
			expected: []int{},
		},
		{
			// single group
			groups:   []string{"adm"},
			expected: []int{43},
		},
		{
			// multiple groups
			groups:   []string{"adm", "grp"},
			expected: []int{43, 1234},
		},
		{
			// invalid group
			groups:   []string{"adm", "grp", "not-exist"},
			expected: nil,
			hasError: true,
		},
		{
			// group with numeric id
			groups:   []string{"43"},
			expected: []int{43},
		},
		{
			// group with unknown numeric id
			groups:   []string{"adm", "10001"},
			expected: []int{43, 10001},
		},
		{
			// groups specified twice with numeric and name
			groups:   []string{"adm", "43"},
			expected: []int{43},
		},
		{
			// groups with too small id
			groups:   []string{"-1"},
			expected: nil,
			hasError: true,
		},
	}

	// The too-large-id case only applies where int is wider than 32 bits.
	if utils.GetIntSize() > 4 {
		tests = append(tests, foo{
			// groups with too large id
			groups:   []string{strconv.Itoa(1 << 31)},
			expected: nil,
			hasError: true,
		})
	}

	for _, test := range tests {
		group := strings.NewReader(groupContent)

		gids, err := GetAdditionalGroups(test.groups, group)
		if test.hasError && err == nil {
			t.Errorf("Parse(%#v) expects error but has none", test)
			continue
		}
		if !test.hasError && err != nil {
			t.Errorf("Parse(%#v) has error %v", test, err)
			continue
		}
		sort.Sort(sort.IntSlice(gids))
		if !reflect.DeepEqual(gids, test.expected) {
			t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups)
		}
	}
}
+
// TestGetAdditionalGroupsNumeric checks that with a nil group source,
// numeric ids pass through unchanged and non-numeric names fail.
func TestGetAdditionalGroupsNumeric(t *testing.T) {
	tests := []struct {
		groups   []string
		expected []int
		hasError bool
	}{
		{
			// numeric groups only
			groups:   []string{"1234", "5678"},
			expected: []int{1234, 5678},
		},
		{
			// numeric and alphabetic
			groups:   []string{"1234", "fake"},
			expected: nil,
			hasError: true,
		},
	}

	for _, test := range tests {
		gids, err := GetAdditionalGroups(test.groups, nil)
		if test.hasError && err == nil {
			t.Errorf("Parse(%#v) expects error but has none", test)
			continue
		}
		if !test.hasError && err != nil {
			t.Errorf("Parse(%#v) has error %v", test, err)
			continue
		}
		sort.Sort(sort.IntSlice(gids))
		if !reflect.DeepEqual(gids, test.expected) {
			t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups)
		}
	}
}
--- /dev/null
+// +build linux
+
+package utils
+
+/*
+ * Copyright 2016, 2017 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import (
+ "fmt"
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
// MaxNameLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will
// never be larger than this value.
const MaxNameLen = 4096
+
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
+
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
	// For some reason, unix.Recvmsg uses the length rather than the capacity
	// when passing the msg_controllen and other attributes to recvmsg. So we
	// have to actually set the length.
	name := make([]byte, MaxNameLen)
	oob := make([]byte, oobSpace)

	sockfd := socket.Fd()
	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
	if err != nil {
		return nil, err
	}

	// A well-formed payload carries a name shorter than the buffer plus
	// exactly oobSpace bytes of control data (one SCM_RIGHTS message).
	if n >= MaxNameLen || oobn != oobSpace {
		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
	}

	// Truncate both buffers to the number of bytes actually received.
	name = name[:n]
	oob = oob[:oobn]

	scms, err := unix.ParseSocketControlMessage(oob)
	if err != nil {
		return nil, err
	}
	if len(scms) != 1 {
		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
	}
	scm := scms[0]

	fds, err := unix.ParseUnixRights(&scm)
	if err != nil {
		return nil, err
	}
	if len(fds) != 1 {
		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
	}
	fd := uintptr(fds[0])

	// The non-auxiliary data is used verbatim as the os.File name.
	return os.NewFile(fd, string(name)), nil
}
+
+// SendFd sends a file descriptor over the given AF_UNIX socket. In
+// addition, the file.Name() of the given file will also be sent as
+// non-auxiliary data in the same payload (allowing to send contextual
+// information for a file descriptor).
+func SendFd(socket *os.File, name string, fd uintptr) error {
+ if len(name) >= MaxNameLen {
+ return fmt.Errorf("sendfd: filename too long: %s", name)
+ }
+ oob := unix.UnixRights(int(fd))
+ return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0)
+}
--- /dev/null
+package utils
+
+import (
+ "encoding/json"
+ "io"
+ "os"
+ "path/filepath"
+ "strings"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ exitSignalOffset = 128
+)
+
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {
	abs, err := filepath.Abs(uncleanRootfs)
	if err != nil {
		return "", err
	}
	// Resolve any symlink components so callers get the real path.
	return filepath.EvalSymlinks(abs)
}
+
+// ExitStatus returns the correct exit status for a process based on if it
+// was signaled or exited cleanly
+func ExitStatus(status unix.WaitStatus) int {
+ if status.Signaled() {
+ return exitSignalOffset + int(status.Signal())
+ }
+ return status.ExitStatus()
+}
+
+// WriteJSON writes the provided struct v to w using standard json marshaling
+func WriteJSON(w io.Writer, v interface{}) error {
+ data, err := json.Marshal(v)
+ if err != nil {
+ return err
+ }
+ _, err = w.Write(data)
+ return err
+}
+
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
	// Deal with empty strings nicely.
	if path == "" {
		return ""
	}

	// Ensure that all paths are cleaned (especially problematic ones like
	// "/../../../../../" which can cause lots of issues).
	cleaned := filepath.Clean(path)
	if filepath.IsAbs(cleaned) {
		// Absolute paths need no further anchoring; Clean already
		// collapsed any "..". (Clean is idempotent, so returning here is
		// equivalent to the final Clean below.)
		return cleaned
	}

	// Anchor the relative path at the root, clean it there (which strips
	// any leading ".." escapes), then make it relative to root again.
	cleaned = filepath.Clean(string(os.PathSeparator) + cleaned)
	// This can't fail, as (by definition) all paths are relative to root.
	cleaned, _ = filepath.Rel(string(os.PathSeparator), cleaned)

	// Clean the path again for good measure.
	return filepath.Clean(cleaned)
}
+
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
	for _, label := range labels {
		kv := strings.SplitN(label, "=", 2)
		// Entries without '=' are malformed and skipped.
		if len(kv) == 2 && kv[0] == query {
			return kv[1]
		}
	}
	return ""
}
+
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = make(map[string]string)
	for _, label := range labels {
		kv := strings.SplitN(label, "=", 2)
		if len(kv) != 2 {
			// Skip malformed entries without '='.
			continue
		}
		key, value := kv[0], kv[1]
		if key == "bundle" {
			// "bundle" is libcontainer's own label, not a user annotation.
			bundle = value
			continue
		}
		userAnnotations[key] = value
	}
	return bundle, userAnnotations
}
+
// GetIntSize returns the size in bytes of the native int type on this
// platform (4 on 32-bit, 8 on 64-bit architectures).
func GetIntSize() int {
	return int(unsafe.Sizeof(1))
}
--- /dev/null
+package utils
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "golang.org/x/sys/unix"
+)
+
// labelTest enumerates label lists and queries together with the value
// SearchLabels is expected to return. Cases cover duplicate keys (first
// match wins), entries without '=', empty entries, and empty values.
var labelTest = []struct {
	labels        []string
	query         string
	expectedValue string
}{
	{[]string{"bundle=/path/to/bundle"}, "bundle", "/path/to/bundle"},
	{[]string{"test=a", "test=b"}, "bundle", ""},
	{[]string{"bundle=a", "test=b", "bundle=c"}, "bundle", "a"},
	{[]string{"", "test=a", "bundle=b"}, "bundle", "b"},
	{[]string{"test", "bundle=a"}, "bundle", "a"},
	{[]string{"test=a", "bundle="}, "bundle", ""},
}
+
+func TestSearchLabels(t *testing.T) {
+ for _, tt := range labelTest {
+ if v := SearchLabels(tt.labels, tt.query); v != tt.expectedValue {
+ t.Errorf("expected value '%s' for query '%s'; got '%s'", tt.expectedValue, tt.query, v)
+ }
+ }
+}
+
+func TestResolveRootfs(t *testing.T) {
+ dir := "rootfs"
+ os.Mkdir(dir, 0600)
+ defer os.Remove(dir)
+
+ path, err := ResolveRootfs(dir)
+ if err != nil {
+ t.Fatal(err)
+ }
+ pwd, err := os.Getwd()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if path != fmt.Sprintf("%s/%s", pwd, "rootfs") {
+ t.Errorf("expected rootfs to be abs and was %s", path)
+ }
+}
+
+func TestResolveRootfsWithSymlink(t *testing.T) {
+ dir := "rootfs"
+ tmpDir, _ := filepath.EvalSymlinks(os.TempDir())
+ os.Symlink(tmpDir, dir)
+ defer os.Remove(dir)
+
+ path, err := ResolveRootfs(dir)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if path != tmpDir {
+ t.Errorf("expected rootfs to be the real path %s and was %s", path, os.TempDir())
+ }
+}
+
+func TestResolveRootfsWithNonExistingDir(t *testing.T) {
+ _, err := ResolveRootfs("foo")
+ if err == nil {
+ t.Error("expected error to happen but received nil")
+ }
+}
+
+func TestExitStatus(t *testing.T) {
+ status := unix.WaitStatus(0)
+ ex := ExitStatus(status)
+ if ex != 0 {
+ t.Errorf("expected exit status to equal 0 and received %d", ex)
+ }
+}
+
+func TestExitStatusSignaled(t *testing.T) {
+ status := unix.WaitStatus(2)
+ ex := ExitStatus(status)
+ if ex != 130 {
+ t.Errorf("expected exit status to equal 130 and received %d", ex)
+ }
+}
+
+func TestWriteJSON(t *testing.T) {
+ person := struct {
+ Name string
+ Age int
+ }{
+ Name: "Alice",
+ Age: 30,
+ }
+
+ var b bytes.Buffer
+ err := WriteJSON(&b, person)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ expected := `{"Name":"Alice","Age":30}`
+ if b.String() != expected {
+ t.Errorf("expected to write %s but was %s", expected, b.String())
+ }
+}
+
+func TestCleanPath(t *testing.T) {
+ path := CleanPath("")
+ if path != "" {
+ t.Errorf("expected to receive empty string and received %s", path)
+ }
+
+ path = CleanPath("rootfs")
+ if path != "rootfs" {
+ t.Errorf("expected to receive 'rootfs' and received %s", path)
+ }
+
+ path = CleanPath("../../../var")
+ if path != "var" {
+ t.Errorf("expected to receive 'var' and received %s", path)
+ }
+
+ path = CleanPath("/../../../var")
+ if path != "/var" {
+ t.Errorf("expected to receive '/var' and received %s", path)
+ }
+
+ path = CleanPath("/foo/bar/")
+ if path != "/foo/bar" {
+ t.Errorf("expected to receive '/foo/bar' and received %s", path)
+ }
+
+ path = CleanPath("/foo/bar/../")
+ if path != "/foo" {
+ t.Errorf("expected to receive '/foo' and received %s", path)
+ }
+}
--- /dev/null
+// +build !windows
+
+package utils
+
+import (
+ "io/ioutil"
+ "os"
+ "strconv"
+
+ "golang.org/x/sys/unix"
+)
+
+func CloseExecFrom(minFd int) error {
+ fdList, err := ioutil.ReadDir("/proc/self/fd")
+ if err != nil {
+ return err
+ }
+ for _, fi := range fdList {
+ fd, err := strconv.Atoi(fi.Name())
+ if err != nil {
+ // ignore non-numeric file names
+ continue
+ }
+
+ if fd < minFd {
+ // ignore descriptors lower than our specified minimum
+ continue
+ }
+
+ // intentionally ignore errors from unix.CloseOnExec
+ unix.CloseOnExec(fd)
+ // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
+ }
+ return nil
+}
+
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
	// SOCK_CLOEXEC keeps both ends from leaking into exec'd children.
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, err
	}
	// fds[1] becomes the parent end ("<name>-p"), fds[0] the child end ("<name>-c").
	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "syscall"
+ "text/tabwriter"
+ "time"
+
+ "encoding/json"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/user"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/urfave/cli"
+)
+
// formatOptions lists the values accepted by the list command's --format flag.
const formatOptions = `table or json`

// containerState represents the platform agnostic pieces relating to a
// running container's status and state
type containerState struct {
	// Version is the OCI version for the container
	Version string `json:"ociVersion"`
	// ID is the container ID
	ID string `json:"id"`
	// InitProcessPid is the init process id in the parent namespace
	InitProcessPid int `json:"pid"`
	// Status is the current status of the container, running, paused, ...
	Status string `json:"status"`
	// Bundle is the path on the filesystem to the bundle
	Bundle string `json:"bundle"`
	// Rootfs is a path to a directory containing the container's root filesystem.
	Rootfs string `json:"rootfs"`
	// Created is the unix timestamp for the creation time of the container in UTC
	Created time.Time `json:"created"`
	// Annotations is the user defined annotations added to the config.
	Annotations map[string]string `json:"annotations,omitempty"`
	// The owner of the state directory (the owner of the container).
	Owner string `json:"owner"`
}
+
// listCommand implements "runc list": it loads the state of every container
// under the global --root directory and prints it either as a table, as
// JSON, or (with --quiet) as bare container IDs.
var listCommand = cli.Command{
	Name:  "list",
	Usage: "lists containers started by runc with the given root",
	ArgsUsage: `

Where the given root is specified via the global option "--root"
(default: "/run/runc").

EXAMPLE 1:
To list containers created via the default "--root":
       # runc list

EXAMPLE 2:
To list containers created using a non-default value for "--root":
       # runc --root value list`,
	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "format, f",
			Value: "table",
			Usage: `select one of: ` + formatOptions,
		},
		cli.BoolFlag{
			Name:  "quiet, q",
			Usage: "display only container IDs",
		},
	},
	Action: func(context *cli.Context) error {
		// list takes no positional arguments; everything comes from flags.
		if err := checkArgs(context, 0, exactArgs); err != nil {
			return err
		}
		s, err := getContainers(context)
		if err != nil {
			return err
		}

		// --quiet: print only the IDs, one per line.
		if context.Bool("quiet") {
			for _, item := range s {
				fmt.Println(item.ID)
			}
			return nil
		}

		switch context.String("format") {
		case "table":
			// Tab-separated columns, aligned by tabwriter.
			w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
			fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
			for _, item := range s {
				fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n",
					item.ID,
					item.InitProcessPid,
					item.Status,
					item.Bundle,
					item.Created.Format(time.RFC3339Nano),
					item.Owner)
			}
			if err := w.Flush(); err != nil {
				return err
			}
		case "json":
			if err := json.NewEncoder(os.Stdout).Encode(s); err != nil {
				return err
			}
		default:
			return fmt.Errorf("invalid format option")
		}
		return nil
	},
}
+
+func getContainers(context *cli.Context) ([]containerState, error) {
+ factory, err := loadFactory(context)
+ if err != nil {
+ return nil, err
+ }
+ root := context.GlobalString("root")
+ absRoot, err := filepath.Abs(root)
+ if err != nil {
+ return nil, err
+ }
+ list, err := ioutil.ReadDir(absRoot)
+ if err != nil {
+ fatal(err)
+ }
+
+ var s []containerState
+ for _, item := range list {
+ if item.IsDir() {
+ // This cast is safe on Linux.
+ stat := item.Sys().(*syscall.Stat_t)
+ owner, err := user.LookupUid(int(stat.Uid))
+ if err != nil {
+ owner.Name = fmt.Sprintf("#%d", stat.Uid)
+ }
+
+ container, err := factory.Load(item.Name())
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err)
+ continue
+ }
+ containerStatus, err := container.Status()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "status for %s: %v\n", item.Name(), err)
+ continue
+ }
+ state, err := container.State()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "state for %s: %v\n", item.Name(), err)
+ continue
+ }
+ pid := state.BaseState.InitProcessPid
+ if containerStatus == libcontainer.Stopped {
+ pid = 0
+ }
+ bundle, annotations := utils.Annotations(state.Config.Labels)
+ s = append(s, containerState{
+ Version: state.BaseState.Config.Version,
+ ID: state.BaseState.ID,
+ InitProcessPid: pid,
+ Status: containerStatus.String(),
+ Bundle: bundle,
+ Rootfs: state.BaseState.Config.Rootfs,
+ Created: state.BaseState.Created,
+ Annotations: annotations,
+ Owner: owner.Name,
+ })
+ }
+ }
+ return s, nil
+}
--- /dev/null
+package main
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "strings"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
// version will be populated by the Makefile, read from
// VERSION file of the source code.
var version = ""

// gitCommit will be the hash that the binary was built from
// and will be populated by the Makefile
var gitCommit = ""

const (
	// specConfig is the name of the OCI specification file inside a bundle.
	specConfig = "config.json"
	// usage is the long help text shown by "runc --help".
	usage = `Open Container Initiative runtime

runc is a command line client for running applications packaged according to
the Open Container Initiative (OCI) format and is a compliant implementation of the
Open Container Initiative specification.

runc integrates well with existing process supervisors to provide a production
container runtime environment for applications. It can be used with your
existing process monitoring tools and the container will be spawned as a
direct child of the process supervisor.

Containers are configured using bundles. A bundle for a container is a directory
that includes a specification file named "` + specConfig + `" and a root filesystem.
The root filesystem contains the contents of the container.

To start a new instance of a container:

    # runc run [ -b bundle ] <container-id>

Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host. Providing the bundle directory using "-b" is optional. The default
value for "bundle" is the current directory.`
)
+
// main wires up the runc CLI: version string, global flags, subcommands,
// and global logging setup, then dispatches to the requested subcommand.
func main() {
	app := cli.NewApp()
	app.Name = "runc"
	app.Usage = usage

	// Assemble the version string from whichever pieces are available:
	// the VERSION file contents, the git commit, and the OCI spec version.
	var v []string
	if version != "" {
		v = append(v, version)
	}
	if gitCommit != "" {
		v = append(v, fmt.Sprintf("commit: %s", gitCommit))
	}
	v = append(v, fmt.Sprintf("spec: %s", specs.Version))
	app.Version = strings.Join(v, "\n")

	// Default state root. Rootless users keep state under $XDG_RUNTIME_DIR
	// instead, since /run/runc is generally not writable for them.
	root := "/run/runc"
	if shouldHonorXDGRuntimeDir() {
		if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
			root = runtimeDir + "/runc"
			// According to the XDG specification, we need to set anything in
			// XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get
			// auto-pruned.
			if err := os.MkdirAll(root, 0700); err != nil {
				fatal(err)
			}
			if err := os.Chmod(root, 0700|os.ModeSticky); err != nil {
				fatal(err)
			}
		}
	}

	app.Flags = []cli.Flag{
		cli.BoolFlag{
			Name:  "debug",
			Usage: "enable debug output for logging",
		},
		cli.StringFlag{
			Name:  "log",
			Value: "/dev/null",
			Usage: "set the log file path where internal debug information is written",
		},
		cli.StringFlag{
			Name:  "log-format",
			Value: "text",
			Usage: "set the format used by logs ('text' (default), or 'json')",
		},
		cli.StringFlag{
			Name:  "root",
			Value: root,
			Usage: "root directory for storage of container state (this should be located in tmpfs)",
		},
		cli.StringFlag{
			Name:  "criu",
			Value: "criu",
			Usage: "path to the criu binary used for checkpoint and restore",
		},
		cli.BoolFlag{
			Name:  "systemd-cgroup",
			Usage: "enable systemd cgroup support, expects cgroupsPath to be of form \"slice:prefix:name\" for e.g. \"system.slice:runc:434234\"",
		},
		cli.StringFlag{
			Name:  "rootless",
			Value: "auto",
			Usage: "ignore cgroup permission errors ('true', 'false', or 'auto')",
		},
	}
	app.Commands = []cli.Command{
		checkpointCommand,
		createCommand,
		deleteCommand,
		eventsCommand,
		execCommand,
		initCommand,
		killCommand,
		listCommand,
		pauseCommand,
		psCommand,
		restoreCommand,
		resumeCommand,
		runCommand,
		specCommand,
		startCommand,
		stateCommand,
		updateCommand,
	}
	// app.Before runs before any subcommand and applies the global logging
	// options (--debug, --log, --log-format).
	app.Before = func(context *cli.Context) error {
		if context.GlobalBool("debug") {
			logrus.SetLevel(logrus.DebugLevel)
		}
		if path := context.GlobalString("log"); path != "" {
			f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
			if err != nil {
				return err
			}
			logrus.SetOutput(f)
		}
		switch context.GlobalString("log-format") {
		case "text":
			// retain logrus's default.
		case "json":
			logrus.SetFormatter(new(logrus.JSONFormatter))
		default:
			return fmt.Errorf("unknown log-format %q", context.GlobalString("log-format"))
		}
		return nil
	}
	// If the command returns an error, cli takes upon itself to print
	// the error on cli.ErrWriter and exit.
	// Use our own writer here to ensure the log gets sent to the right location.
	cli.ErrWriter = &FatalWriter{cli.ErrWriter}
	if err := app.Run(os.Args); err != nil {
		fatal(err)
	}
}
+
// FatalWriter wraps cli's error writer so that any error text the cli
// package prints is also recorded via logrus before being forwarded to the
// original destination.
type FatalWriter struct {
	cliErrWriter io.Writer
}

// Write logs p as an error and then passes it through unchanged to the
// wrapped writer, returning that writer's result.
func (f *FatalWriter) Write(p []byte) (n int, err error) {
	logrus.Error(string(p))
	return f.cliErrWriter.Write(p)
}
--- /dev/null
+runc man pages
+====================
+
+This directory contains man pages for runc in markdown format.
+
+To generate man pages from it, use this command
+
+ ./md2man-all.sh
+
+You will see man pages generated under the man8 directory.
+
--- /dev/null
#!/bin/bash
# Generate man pages (man<N>/<name>.<N>) from the *.N.md markdown sources in
# this directory using go-md2man. Pass -q to suppress command echoing.
set -e

# get into this script's directory
cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"

[ "$1" = '-q' ] || {
	set -x
	pwd
}

# Use POSIX `command -v` rather than the non-standard `which`.
if ! command -v go-md2man &> /dev/null; then
	echo "To install man pages, please install 'go-md2man'."
	exit 0
fi

for FILE in *.md; do
	base="$(basename "$FILE")"
	name="${base%.md}"
	num="${name##*.}"
	# `-o` inside test is obsolescent per POSIX; use two tests instead.
	if [ -z "$num" ] || [ "$name" = "$num" ]; then
		# skip files that aren't of the format xxxx.N.md (like README.md)
		continue
	fi
	mkdir -p "./man${num}"
	go-md2man -in "$FILE" -out "./man${num}/${name}"
done
--- /dev/null
+# NAME
+ runc checkpoint - checkpoint a running container
+
+# SYNOPSIS
+ runc checkpoint [command options] <container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+checkpointed.
+
+# DESCRIPTION
+ The checkpoint command saves the state of the container instance.
+
+# OPTIONS
+ --image-path value path for saving criu image files
+ --work-path value path for saving work files and logs
+ --parent-path value path for previous criu image files in pre-dump
+ --leave-running leave the process running after checkpointing
+ --tcp-established allow open tcp connections
+ --ext-unix-sk allow external unix sockets
+ --shell-job allow shell jobs
+ --page-server value ADDRESS:PORT of the page server
+ --file-locks handle file locks, for safety
+ --pre-dump dump container's memory information only, leave the container running after this
+ --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict'
+ --empty-ns value create a namespace, but don't restore its properties
--- /dev/null
+# NAME
+ runc create - create a container
+
+# SYNOPSIS
+ runc create [command options] <container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+ The create command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "config.json" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.
+
+# OPTIONS
+ --bundle value, -b value path to the root of the bundle directory, defaults to the current directory
+ --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal
+ --pid-file value specify the file to write the process id to
+ --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk
+    --no-new-keyring          do not create a new session keyring for the container. This will cause the container to inherit the calling process's session key
+ --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)
--- /dev/null
+# NAME
+ runc delete - delete any resources held by the container often used with detached container
+
+# SYNOPSIS
+ runc delete [command options] <container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+# OPTIONS
+ --force, -f Forcibly deletes the container if it is still running (uses SIGKILL)
+
+# EXAMPLE
+For example, if the container id is "ubuntu01" and runc list currently shows the
+status of "ubuntu01" as "stopped" the following will delete resources held for
+"ubuntu01" removing "ubuntu01" from the runc list of containers:
+
+ # runc delete ubuntu01
--- /dev/null
+# NAME
+ runc events - display container events such as OOM notifications, cpu, memory, and IO usage statistics
+
+# SYNOPSIS
+ runc events [command options] <container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+# DESCRIPTION
+ The events command displays information about the container. By default the
+information is displayed once every 5 seconds.
+
+# OPTIONS
+ --interval value set the stats collection interval (default: 5s)
+ --stats display the container's stats then exit
--- /dev/null
+# NAME
+ runc exec - execute new process inside the container
+
+# SYNOPSIS
+ runc exec [command options] <container-id> -- <container command> [args...]
+
+Where "<container-id>" is the name for the instance of the container and
+"<container command>" is the command to be executed in the container.
+
+# EXAMPLE
+For example, if the container is configured to run the linux ps command the
+following will output a list of processes running in the container:
+
+ # runc exec <container-id> ps
+
+# OPTIONS
+ --console value specify the pty slave path for use with the container
+ --cwd value current working directory in the container
+ --env value, -e value set environment variables
+ --tty, -t allocate a pseudo-TTY
+ --user value, -u value UID (format: <uid>[:<gid>])
+ --additional-gids value, -g value additional gids
+ --process value, -p value path to the process.json
+ --detach, -d detach from the container's process
+ --pid-file value specify the file to write the process id to
+ --process-label value set the asm process label for the process commonly used with selinux
+ --apparmor value set the apparmor profile for the process
+ --no-new-privs set the no new privileges value for the process
+ --cap value, -c value add a capability to the bounding set for the process
+ --no-subreaper disable the use of the subreaper used to reap reparented processes
--- /dev/null
+# NAME
+ runc kill - kill sends the specified signal (default: SIGTERM) to the container's init process
+
+# SYNOPSIS
+ runc kill [command options] <container-id> <signal>
+
+Where "<container-id>" is the name for the instance of the container and
+"<signal>" is the signal to be sent to the init process.
+
+# OPTIONS
+ --all, -a send the specified signal to all processes inside the container
+
+# EXAMPLE
+
+For example, if the container id is "ubuntu01" the following will send a "KILL"
+signal to the init process of the "ubuntu01" container:
+
+ # runc kill ubuntu01 KILL
--- /dev/null
+# NAME
+ runc list - lists containers started by runc with the given root
+
+# SYNOPSIS
+ runc list [command options]
+
+# EXAMPLE
+Where the given root is specified via the global option "--root"
+(default: "/run/runc").
+
+To list containers created via the default "--root":
+ # runc list
+
+To list containers created using a non-default value for "--root":
+ # runc --root value list
+
+# OPTIONS
+ --format value, -f value select one of: table or json (default: "table")
+ --quiet, -q display only container IDs
--- /dev/null
+# NAME
+ runc pause - pause suspends all processes inside the container
+
+# SYNOPSIS
+ runc pause <container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+paused.
+
+# DESCRIPTION
+ The pause command suspends all processes in the instance of the container.
+Use runc list to identify instances of containers and their current status.
--- /dev/null
+# NAME
+ runc ps - ps displays the processes running inside a container
+
+# SYNOPSIS
+ runc ps [command options] <container-id> [ps options]
+
+# OPTIONS
+ --format value, -f value select one of: table(default) or json
+
+The default format is table. The following will output the processes of a container
+in json format:
+
+ # runc ps -f json <container-id>
--- /dev/null
+# NAME
+ runc restore - restore a container from a previous checkpoint
+
+# SYNOPSIS
+ runc restore [command options] <container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+restored.
+
+# DESCRIPTION
+ Restores the saved state of the container instance that was previously saved
+using the runc checkpoint command.
+
+# OPTIONS
+ --image-path value path to criu image files for restoring
+ --work-path value path for saving work files and logs
+ --tcp-established allow open tcp connections
+ --ext-unix-sk allow external unix sockets
+ --shell-job allow shell jobs
+ --file-locks handle file locks, for safety
+ --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict'
+ --bundle value, -b value path to the root of the bundle directory
+ --detach, -d detach from the container's process
+ --pid-file value specify the file to write the process id to
+ --no-subreaper disable the use of the subreaper used to reap reparented processes
+ --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk
--- /dev/null
+# NAME
+ runc resume - resumes all processes that have been previously paused
+
+# SYNOPSIS
+ runc resume <container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+resumed.
+
+# DESCRIPTION
+ The resume command resumes all processes in the instance of the container.
+Use runc list to identify instances of containers and their current status.
--- /dev/null
+# NAME
+ runc run - create and run a container
+
+# SYNOPSIS
+ runc run [command options] <container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+ The run command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "config.json" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.
+
+# OPTIONS
+ --bundle value, -b value path to the root of the bundle directory, defaults to the current directory
+ --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal
+ --detach, -d detach from the container's process
+ --pid-file value specify the file to write the process id to
+ --no-subreaper disable the use of the subreaper used to reap reparented processes
+ --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk
+    --no-new-keyring          do not create a new session keyring for the container. This will cause the container to inherit the calling process's session key
+ --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)
--- /dev/null
+# NAME
+ runc spec - create a new specification file
+
+# SYNOPSIS
+ runc spec [command options] [arguments...]
+
+# DESCRIPTION
+ The spec command creates the new specification file named "config.json" for
+the bundle.
+
+The spec generated is just a starter file. Editing of the spec is required to
+achieve desired results. For example, the newly generated spec includes an args
+parameter that is initially set to call the "sh" command when the container is
+started. Calling "sh" may work for an ubuntu container or busybox, but will not
+work for containers that do not include the "sh" program.
+
+# EXAMPLE
+ To run docker's hello-world container one needs to set the args parameter
+in the spec to call hello. This can be done using the sed command or a text
+editor. The following commands create a bundle for hello-world, change the
+default args parameter in the spec from "sh" to "/hello", then run the hello
+command in a new hello-world container named container1:
+
+ mkdir hello
+ cd hello
+ docker pull hello-world
+ docker export $(docker create hello-world) > hello-world.tar
+ mkdir rootfs
+ tar -C rootfs -xf hello-world.tar
+ runc spec
+ sed -i 's;"sh";"/hello";' config.json
+ runc start container1
+
+In the start command above, "container1" is the name for the instance of the
+container that you are starting. The name you provide for the container instance
+must be unique on your host.
+
+An alternative for generating a customized spec config is to use "oci-runtime-tool", the
+sub-command "oci-runtime-tool generate" has lots of options that can be used to do any
+customizations as you want, see [runtime-tools](https://github.com/opencontainers/runtime-tools)
+to get more information.
+
+When starting a container through runc, runc needs root privilege. If not
+already running as root, you can use sudo to give runc root privilege. For
+example: "sudo runc start container1" will give runc root privilege to start the
+container on your host.
+
+Alternatively, you can start a rootless container, which has the ability to run without root privileges. For this to work, the specification file needs to be adjusted accordingly. You can pass the parameter --rootless to this command to generate a proper rootless spec file.
+
+# OPTIONS
+ --bundle value, -b value path to the root of the bundle directory
+ --rootless generate a configuration for a rootless container
--- /dev/null
+# NAME
+ runc start - start executes the user defined process in a created container
+
+# SYNOPSIS
+ runc start <container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.
+
+# DESCRIPTION
+ The start command executes the user defined process in a created container.
--- /dev/null
+# NAME
+ runc state - output the state of a container
+
+# SYNOPSIS
+ runc state <container-id>
+
+Where "<container-id>" is your name for the instance of the container.
+
+# DESCRIPTION
+ The state command outputs current state information for the
+instance of a container.
--- /dev/null
+# NAME
+ runc update - update container resource constraints
+
+# SYNOPSIS
+ runc update [command options] <container-id>
+
+# DESCRIPTION
+ The data can be read from a file or the standard input; the
+accepted format is as follows (unchanged values can be omitted):
+
+ {
+ "memory": {
+ "limit": 0,
+ "reservation": 0,
+ "swap": 0,
+ "kernel": 0,
+ "kernelTCP": 0
+ },
+ "cpu": {
+ "shares": 0,
+ "quota": 0,
+ "period": 0,
+ "realtimeRuntime": 0,
+ "realtimePeriod": 0,
+ "cpus": "",
+ "mems": ""
+ },
+ "blockIO": {
+ "blkioWeight": 0
+ }
+ }
+
+Note: if data is to be read from a file or the standard input, all
+other options are ignored.
+
+# OPTIONS
+ --resources value, -r value path to the file containing the resources to update or '-' to read from the standard input
+ --blkio-weight value Specifies per cgroup weight, range is from 10 to 1000 (default: 0)
+ --cpu-period value CPU CFS period to be used for hardcapping (in usecs). 0 to use system default
+ --cpu-quota value CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period
+ --cpu-rt-period value CPU realtime period to be used for hardcapping (in usecs). 0 to use system default
+ --cpu-rt-runtime value CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period
+ --cpu-share value CPU shares (relative weight vs. other containers)
+ --cpuset-cpus value CPU(s) to use
+ --cpuset-mems value Memory node(s) to use
+ --kernel-memory value Kernel memory limit (in bytes)
+ --kernel-memory-tcp value Kernel memory limit (in bytes) for tcp buffer
+ --memory value Memory limit (in bytes)
+ --memory-reservation value Memory reservation or soft_limit (in bytes)
+ --memory-swap value Total memory usage (memory + swap); set '-1' to enable unlimited swap
+ --pids-limit value Maximum number of pids allowed in the container (default: 0)
+ --l3-cache-schema The string of Intel RDT/CAT L3 cache schema
+ --mem-bw-schema The string of Intel RDT/MBA memory bandwidth schema
--- /dev/null
+# NAME
+ runc - Open Container Initiative runtime
+
+# SYNOPSIS
+ runc [global options] command [command options] [arguments...]
+
+# DESCRIPTION
+runc is a command line client for running applications packaged according to
+the Open Container Initiative (OCI) format and is a compliant implementation of the
+Open Container Initiative specification.
+
+runc integrates well with existing process supervisors to provide a production
+container runtime environment for applications. It can be used with your
+existing process monitoring tools and the container will be spawned as a
+direct child of the process supervisor.
+
+Containers are configured using bundles. A bundle for a container is a directory
+that includes a specification file named "config.json" and a root filesystem.
+The root filesystem contains the contents of the container.
+
+To start a new instance of a container:
+
+ # runc start [ -b bundle ] <container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host. Providing the bundle directory using "-b" is optional. The default
+value for "bundle" is the current directory.
+
+# COMMANDS
+ checkpoint checkpoint a running container
+ create create a container
+ delete delete any resources held by the container often used with detached containers
+ events display container events such as OOM notifications, cpu, memory, IO and network stats
+ exec execute new process inside the container
+ init initialize the namespaces and launch the process (do not call it outside of runc)
+ kill kill sends the specified signal (default: SIGTERM) to the container's init process
+ list lists containers started by runc with the given root
+ pause pause suspends all processes inside the container
+ ps displays the processes running inside a container
+ restore restore a container from a previous checkpoint
+ resume resumes all processes that have been previously paused
+ run create and run a container
+ spec create a new specification file
+ start executes the user defined process in a created container
+ state output the state of a container
+ update update container resource constraints
+ help, h Shows a list of commands or help for one command
+
+# GLOBAL OPTIONS
+ --debug enable debug output for logging
+ --log value set the log file path where internal debug information is written (default: "/dev/null")
+ --log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
+ --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
+ --criu value path to the criu binary used for checkpoint and restore (default: "criu")
+ --systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
+ --rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto")
+ --help, -h show help
+ --version, -v print the version
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "net"
+ "path/filepath"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
+// notifySocket relays the systemd sd_notify protocol between a
+// container and the host's notify socket (NOTIFY_SOCKET).
+type notifySocket struct {
+	socket     *net.UnixConn // datagram socket bound at socketPath (nil until setupSocket)
+	host       string        // host-side NOTIFY_SOCKET path messages are forwarded to
+	socketPath string        // path of the proxy socket in the container's state dir
+}
+
+func newNotifySocket(context *cli.Context, notifySocketHost string, id string) *notifySocket {
+ if notifySocketHost == "" {
+ return nil
+ }
+
+ root := filepath.Join(context.GlobalString("root"), id)
+ path := filepath.Join(root, "notify.sock")
+
+ notifySocket := ¬ifySocket{
+ socket: nil,
+ host: notifySocketHost,
+ socketPath: path,
+ }
+
+ return notifySocket
+}
+
+// Close closes the proxy socket created by setupSocket.
+func (s *notifySocket) Close() error {
+	return s.socket.Close()
+}
+
+// setupSpec adjusts the container spec so sd_notify works from inside
+// the container: the proxy socket is bind-mounted at the host's
+// NOTIFY_SOCKET path and NOTIFY_SOCKET is set to point at it.
+func (s *notifySocket) setupSpec(context *cli.Context, spec *specs.Spec) {
+	mount := specs.Mount{Destination: s.host, Source: s.socketPath, Options: []string{"bind"}}
+	spec.Mounts = append(spec.Mounts, mount)
+	spec.Process.Env = append(spec.Process.Env, fmt.Sprintf("NOTIFY_SOCKET=%s", s.host))
+}
+
+// setupSocket binds a unixgram socket at s.socketPath; datagrams the
+// container writes there are later forwarded by run.
+func (s *notifySocket) setupSocket() error {
+	addr := net.UnixAddr{
+		Name: s.socketPath,
+		Net:  "unixgram",
+	}
+
+	socket, err := net.ListenUnixgram("unixgram", &addr)
+	if err != nil {
+		return err
+	}
+
+	s.socket = socket
+	return nil
+}
+
+// run reads sd_notify datagrams from the container's proxy socket and
+// forwards the first READY= line to the host notify socket. pid1 must
+// be set only with -d, as it is used to set the new process as the main
+// process for the service in systemd (via MAINPID). The function
+// returns after the first READY= message or on any read/write error.
+func (s *notifySocket) run(pid1 int) {
+	buf := make([]byte, 512)
+	// The original text read "¬ifySocketHostAddr" — HTML-entity mojibake
+	// of "&notifySocketHostAddr". Restored so the file compiles.
+	notifySocketHostAddr := net.UnixAddr{Name: s.host, Net: "unixgram"}
+	client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
+	if err != nil {
+		logrus.Error(err)
+		return
+	}
+	// Close the host-side connection on every exit path (the original
+	// leaked it on the early returns below).
+	defer client.Close()
+	for {
+		r, err := s.socket.Read(buf)
+		if err != nil {
+			break
+		}
+		var out bytes.Buffer
+		for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
+			if bytes.HasPrefix(line, []byte("READY=")) {
+				_, err = out.Write(line)
+				if err != nil {
+					return
+				}
+
+				_, err = out.Write([]byte{'\n'})
+				if err != nil {
+					return
+				}
+
+				_, err = client.Write(out.Bytes())
+				if err != nil {
+					return
+				}
+
+				// now we can inform systemd to use pid1 as the pid to monitor
+				if pid1 > 0 {
+					newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
+					client.Write([]byte(newPid))
+				}
+				return
+			}
+		}
+	}
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
+// pauseCommand implements "runc pause": it suspends (freezes) every
+// process in the container until "runc resume" is called.
+var pauseCommand = cli.Command{
+	Name:  "pause",
+	Usage: "pause suspends all processes inside the container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+paused. `,
+	Description: `The pause command suspends all processes in the instance of the container.
+
+Use runc list to identify instances of containers and their current status.`,
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		rootlessCg, err := shouldUseRootlessCgroupManager(context)
+		if err != nil {
+			return err
+		}
+		// The rootless cgroup manager may lack write access to the
+		// freezer; warn rather than fail up front.
+		if rootlessCg {
+			logrus.Warnf("runc pause may fail if you don't have the full access to cgroups")
+		}
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		return container.Pause()
+	},
+}
+
+// resumeCommand implements "runc resume": it thaws every process that
+// "runc pause" previously froze.
+var resumeCommand = cli.Command{
+	Name:  "resume",
+	Usage: "resumes all processes that have been previously paused",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+resumed.`,
+	Description: `The resume command resumes all processes in the instance of the container.
+
+Use runc list to identify instances of containers and their current status.`,
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		rootlessCg, err := shouldUseRootlessCgroupManager(context)
+		if err != nil {
+			return err
+		}
+		// The rootless cgroup manager may lack write access to the
+		// freezer; warn rather than fail up front.
+		if rootlessCg {
+			logrus.Warn("runc resume may fail if you don't have the full access to cgroups")
+		}
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		return container.Resume()
+	},
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "os/exec"
+ "strconv"
+ "strings"
+
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
+// psCommand implements "runc ps": it runs the host's ps(1) and filters
+// its output down to rows whose PID belongs to the container.
+var psCommand = cli.Command{
+	Name:      "ps",
+	Usage:     "ps displays the processes running inside a container",
+	ArgsUsage: `<container-id> [ps options]`,
+	Flags: []cli.Flag{
+		cli.StringFlag{
+			Name:  "format, f",
+			Value: "table",
+			Usage: `select one of: ` + formatOptions,
+		},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, minArgs); err != nil {
+			return err
+		}
+		rootlessCg, err := shouldUseRootlessCgroupManager(context)
+		if err != nil {
+			return err
+		}
+		if rootlessCg {
+			logrus.Warn("runc ps may fail if you don't have the full access to cgroups")
+		}
+
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+
+		// PIDs of every process in the container's cgroup.
+		pids, err := container.Processes()
+		if err != nil {
+			return err
+		}
+
+		switch context.String("format") {
+		case "table":
+			// fall through to the ps(1)-based table output below
+		case "json":
+			return json.NewEncoder(os.Stdout).Encode(pids)
+		default:
+			return fmt.Errorf("invalid format option")
+		}
+
+		// [1:] is to remove command name, ex:
+		// context.Args(): [container_id ps_arg1 ps_arg2 ...]
+		// psArgs: [ps_arg1 ps_arg2 ...]
+		//
+		psArgs := context.Args()[1:]
+		if len(psArgs) == 0 {
+			psArgs = []string{"-ef"}
+		}
+
+		cmd := exec.Command("ps", psArgs...)
+		output, err := cmd.CombinedOutput()
+		if err != nil {
+			return fmt.Errorf("%s: %s", err, output)
+		}
+
+		// First output line is the header; locate the PID column in it.
+		lines := strings.Split(string(output), "\n")
+		pidIndex, err := getPidIndex(lines[0])
+		if err != nil {
+			return err
+		}
+
+		fmt.Println(lines[0])
+		for _, line := range lines[1:] {
+			if len(line) == 0 {
+				continue
+			}
+			fields := strings.Fields(line)
+			p, err := strconv.Atoi(fields[pidIndex])
+			if err != nil {
+				return fmt.Errorf("unexpected pid '%s': %s", fields[pidIndex], err)
+			}
+
+			// Print only rows whose PID is in the container.
+			for _, pid := range pids {
+				if pid == p {
+					fmt.Println(line)
+					break
+				}
+			}
+		}
+		return nil
+	},
+	// Keep ps's own flags (e.g. "-ef") out of cli's argument reordering.
+	SkipArgReorder: true,
+}
+
+// getPidIndex returns the column index of the "PID" field in a ps(1)
+// header line, or an error when the header has no PID column.
+// (The dead pidIndex local from the original has been removed.)
+func getPidIndex(title string) (int, error) {
+	for i, name := range strings.Fields(title) {
+		if name == "PID" {
+			return i, nil
+		}
+	}
+
+	return -1, fmt.Errorf("couldn't find PID field in ps output")
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
+// restoreCommand implements "runc restore": it recreates a container
+// from a CRIU checkpoint and exits with the container's exit status.
+var restoreCommand = cli.Command{
+	Name:  "restore",
+	Usage: "restore a container from a previous checkpoint",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+restored.`,
+	Description: `Restores the saved state of the container instance that was previously saved
+using the runc checkpoint command.`,
+	Flags: []cli.Flag{
+		cli.StringFlag{
+			Name:  "console-socket",
+			Value: "",
+			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+		},
+		cli.StringFlag{
+			Name:  "image-path",
+			Value: "",
+			Usage: "path to criu image files for restoring",
+		},
+		cli.StringFlag{
+			Name:  "work-path",
+			Value: "",
+			Usage: "path for saving work files and logs",
+		},
+		cli.BoolFlag{
+			Name:  "tcp-established",
+			Usage: "allow open tcp connections",
+		},
+		cli.BoolFlag{
+			Name:  "ext-unix-sk",
+			Usage: "allow external unix sockets",
+		},
+		cli.BoolFlag{
+			Name:  "shell-job",
+			Usage: "allow shell jobs",
+		},
+		cli.BoolFlag{
+			Name:  "file-locks",
+			Usage: "handle file locks, for safety",
+		},
+		cli.StringFlag{
+			Name:  "manage-cgroups-mode",
+			Value: "",
+			Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'",
+		},
+		cli.StringFlag{
+			Name:  "bundle, b",
+			Value: "",
+			Usage: "path to the root of the bundle directory",
+		},
+		cli.BoolFlag{
+			Name:  "detach,d",
+			Usage: "detach from the container's process",
+		},
+		cli.StringFlag{
+			Name:  "pid-file",
+			Value: "",
+			Usage: "specify the file to write the process id to",
+		},
+		cli.BoolFlag{
+			Name:  "no-subreaper",
+			Usage: "disable the use of the subreaper used to reap reparented processes",
+		},
+		cli.BoolFlag{
+			Name:  "no-pivot",
+			Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk",
+		},
+		cli.StringSliceFlag{
+			Name:  "empty-ns",
+			Usage: "create a namespace, but don't restore its properties",
+		},
+		cli.BoolFlag{
+			Name:  "auto-dedup",
+			Usage: "enable auto deduplication of memory images",
+		},
+		cli.BoolFlag{
+			Name:  "lazy-pages",
+			Usage: "use userfaultfd to lazily restore memory pages",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		// XXX: Currently this is untested with rootless containers.
+		// Fixed: the message previously said "runc checkpoint", a
+		// copy-paste from the checkpoint command.
+		if os.Geteuid() != 0 || system.RunningInUserNS() {
+			logrus.Warn("runc restore is untested with rootless containers")
+		}
+
+		spec, err := setupSpec(context)
+		if err != nil {
+			return err
+		}
+		options := criuOptions(context)
+		if err := setEmptyNsMask(context, options); err != nil {
+			return err
+		}
+		status, err := startContainer(context, spec, CT_ACT_RESTORE, options)
+		if err != nil {
+			return err
+		}
+		// exit with the container's exit status so any external supervisor is
+		// notified of the exit with the correct exit status.
+		os.Exit(status)
+		return nil
+	},
+}
+
+// criuOptions builds the libcontainer CRIU options from the CLI flags
+// shared by the checkpoint and restore commands. It creates the image
+// directory, exiting via fatal() on failure.
+func criuOptions(context *cli.Context) *libcontainer.CriuOpts {
+	imagePath := getCheckpointImagePath(context)
+	// 0700: the image directory contains process memory dumps, so it must
+	// be owner-only; the original 0655 also lacked the owner execute bit,
+	// leaving the directory untraversable by its owner.
+	if err := os.MkdirAll(imagePath, 0700); err != nil {
+		fatal(err)
+	}
+	return &libcontainer.CriuOpts{
+		ImagesDirectory:         imagePath,
+		WorkDirectory:           context.String("work-path"),
+		ParentImage:             context.String("parent-path"),
+		LeaveRunning:            context.Bool("leave-running"),
+		TcpEstablished:          context.Bool("tcp-established"),
+		ExternalUnixConnections: context.Bool("ext-unix-sk"),
+		ShellJob:                context.Bool("shell-job"),
+		FileLocks:               context.Bool("file-locks"),
+		PreDump:                 context.Bool("pre-dump"),
+		AutoDedup:               context.Bool("auto-dedup"),
+		LazyPages:               context.Bool("lazy-pages"),
+		StatusFd:                context.String("status-fd"),
+	}
+}
--- /dev/null
+package main
+
+import "fmt"
+
+// Local mirror of the Linux RLIMIT_* resource indices (see
+// setrlimit(2)); the iota order must match the kernel's numbering.
+const (
+	RLIMIT_CPU        = iota // CPU time in sec
+	RLIMIT_FSIZE             // Maximum filesize
+	RLIMIT_DATA              // max data size
+	RLIMIT_STACK             // max stack size
+	RLIMIT_CORE              // max core file size
+	RLIMIT_RSS               // max resident set size
+	RLIMIT_NPROC             // max number of processes
+	RLIMIT_NOFILE            // max number of open files
+	RLIMIT_MEMLOCK           // max locked-in-memory address space
+	RLIMIT_AS                // address space limit
+	RLIMIT_LOCKS             // maximum file locks held
+	RLIMIT_SIGPENDING        // max number of pending signals
+	RLIMIT_MSGQUEUE          // maximum bytes in POSIX mqueues
+	RLIMIT_NICE              // max nice prio allowed to raise to
+	RLIMIT_RTPRIO            // maximum realtime priority
+	RLIMIT_RTTIME            // timeout for RT tasks in us
+)
+
+// rlimitMap resolves the textual rlimit names used in the spec to the
+// numeric constants above.
+var rlimitMap = map[string]int{
+	"RLIMIT_CPU":        RLIMIT_CPU,
+	"RLIMIT_FSIZE":      RLIMIT_FSIZE,
+	"RLIMIT_DATA":       RLIMIT_DATA,
+	"RLIMIT_STACK":      RLIMIT_STACK,
+	"RLIMIT_CORE":       RLIMIT_CORE,
+	"RLIMIT_RSS":        RLIMIT_RSS,
+	"RLIMIT_NPROC":      RLIMIT_NPROC,
+	"RLIMIT_NOFILE":     RLIMIT_NOFILE,
+	"RLIMIT_MEMLOCK":    RLIMIT_MEMLOCK,
+	"RLIMIT_AS":         RLIMIT_AS,
+	"RLIMIT_LOCKS":      RLIMIT_LOCKS,
+	"RLIMIT_SIGPENDING": RLIMIT_SIGPENDING,
+	"RLIMIT_MSGQUEUE":   RLIMIT_MSGQUEUE,
+	"RLIMIT_NICE":       RLIMIT_NICE,
+	"RLIMIT_RTPRIO":     RLIMIT_RTPRIO,
+	"RLIMIT_RTTIME":     RLIMIT_RTTIME,
+}
+
+// strToRlimit translates an "RLIMIT_*" name into its numeric value,
+// returning an error for unknown names.
+func strToRlimit(key string) (int, error) {
+	if rl, ok := rlimitMap[key]; ok {
+		return rl, nil
+	}
+	return 0, fmt.Errorf("wrong rlimit value: %s", key)
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/urfave/cli"
+)
+
+// shouldUseRootlessCgroupManager reports whether runc must fall back to
+// the rootless cgroup manager because full cgroup access cannot be
+// assumed.
+func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) {
+	if context != nil {
+		b, err := parseBoolOrAuto(context.GlobalString("rootless"))
+		if err != nil {
+			return false, err
+		}
+		// A non-nil value means --rootless was set explicitly and
+		// overrides auto-detection.
+		if b != nil {
+			return *b, nil
+		}
+
+		if context.GlobalBool("systemd-cgroup") {
+			return false, nil
+		}
+	}
+	switch {
+	case os.Geteuid() != 0:
+		// Not root at all: the rootless manager is the only option.
+		return true, nil
+	case !system.RunningInUserNS():
+		// euid == 0 in the initial namespace (i.e. the real root).
+		return false, nil
+	default:
+		// euid == 0 inside a user namespace. As we are unaware of the
+		// cgroups path we can't determine whether we have full access to
+		// it, so the rootless manager is the safe choice either way.
+		return true, nil
+	}
+}
+
+// shouldHonorXDGRuntimeDir reports whether $XDG_RUNTIME_DIR should be
+// used as the default state root instead of /run/runc.
+func shouldHonorXDGRuntimeDir() bool {
+	switch {
+	case os.Getenv("XDG_RUNTIME_DIR") == "":
+		return false
+	case os.Geteuid() != 0:
+		return true
+	case !system.RunningInUserNS():
+		// euid == 0 in the initial namespace (the real root): keep using
+		// /run/runc and ignore $XDG_RUNTIME_DIR (e.g. /run/user/0) for
+		// backward compatibility.
+		return false
+	}
+	// euid == 0 inside a user namespace: honor the directory unless the
+	// session user actually is root.
+	name, ok := os.LookupEnv("USER")
+	return !ok || name != "root"
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "os"
+
+ "github.com/urfave/cli"
+)
+
+// runCommand is runc's default action: create a container from a
+// bundle and immediately execute its user-defined process.
+var runCommand = cli.Command{
+	Name:  "run",
+	Usage: "create and run a container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+	Description: `The run command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "` + specConfig + `" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.`,
+	Flags: []cli.Flag{
+		cli.StringFlag{
+			Name:  "bundle, b",
+			Value: "",
+			Usage: `path to the root of the bundle directory, defaults to the current directory`,
+		},
+		cli.StringFlag{
+			Name:  "console-socket",
+			Value: "",
+			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+		},
+		cli.BoolFlag{
+			Name:  "detach, d",
+			Usage: "detach from the container's process",
+		},
+		cli.StringFlag{
+			Name:  "pid-file",
+			Value: "",
+			Usage: "specify the file to write the process id to",
+		},
+		cli.BoolFlag{
+			Name:  "no-subreaper",
+			Usage: "disable the use of the subreaper used to reap reparented processes",
+		},
+		cli.BoolFlag{
+			Name:  "no-pivot",
+			Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk",
+		},
+		cli.BoolFlag{
+			Name:  "no-new-keyring",
+			Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key",
+		},
+		cli.IntFlag{
+			Name:  "preserve-fds",
+			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		if err := revisePidFile(context); err != nil {
+			return err
+		}
+		spec, err := setupSpec(context)
+		if err != nil {
+			return err
+		}
+		status, err := startContainer(context, spec, CT_ACT_RUN, nil)
+		if err != nil {
+			return err
+		}
+		// Propagate the container's exit status so any external
+		// supervisor sees the correct result.
+		os.Exit(status)
+		return nil // unreachable: os.Exit does not return
+	},
+}
--- /dev/null
+#!/bin/bash
+
+# Compute the commit range between upstream and HEAD, and define the
+# validate_diff/validate_log helpers used by the validate scripts.
+# VALIDATE_UPSTREAM doubles as a guard so sourcing this twice skips the
+# (expensive) fetch.
+if [ -z "$VALIDATE_UPSTREAM" ]; then
+	# this is kind of an expensive check, so let's not do this twice if we
+	# are running more than one validate bundlescript
+
+	VALIDATE_REPO='https://github.com/opencontainers/runc.git'
+	VALIDATE_BRANCH='master'
+
+	# On Travis pull requests, validate against the PR's target
+	# repository/branch instead of upstream master.
+	if [ "$TRAVIS" = 'true' -a "$TRAVIS_PULL_REQUEST" != 'false' ]; then
+		VALIDATE_REPO="https://github.com/${TRAVIS_REPO_SLUG}.git"
+		VALIDATE_BRANCH="${TRAVIS_BRANCH}"
+	fi
+
+	VALIDATE_HEAD="$(git rev-parse --verify HEAD)"
+
+	git fetch -q "$VALIDATE_REPO" "refs/heads/$VALIDATE_BRANCH"
+	VALIDATE_UPSTREAM="$(git rev-parse --verify FETCH_HEAD)"
+
+	# Two-dot range for log, three-dot (merge-base) range for diff.
+	VALIDATE_COMMIT_LOG="$VALIDATE_UPSTREAM..$VALIDATE_HEAD"
+	VALIDATE_COMMIT_DIFF="$VALIDATE_UPSTREAM...$VALIDATE_HEAD"
+
+	# validate_diff: diff of the changes this branch introduces.
+	validate_diff() {
+		if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then
+			git diff "$VALIDATE_COMMIT_DIFF" "$@"
+		fi
+	}
+	# validate_log: log of the commits this branch introduces.
+	validate_log() {
+		if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then
+			git log "$VALIDATE_COMMIT_LOG" "$@"
+		fi
+	}
+fi
--- /dev/null
+#!/usr/bin/env bash
+set -e
+
+# bits of this were adapted from check_config.sh in docker
+# see also https://github.com/docker/docker/blob/master/contrib/check-config.sh
+
+# Well-known locations of the running kernel's config.
+possibleConfigs=(
+	'/proc/config.gz'
+	"/boot/config-$(uname -r)"
+	"/usr/src/linux-$(uname -r)/.config"
+	'/usr/src/linux/.config'
+)
+# Candidate file names tried when a directory is given instead of a file.
+possibleConfigFiles=(
+	'config.gz'
+	"config-$(uname -r)"
+	'.config'
+)
+
+# Fallback zgrep for systems that lack it: zcat piped into grep.
+if ! command -v zgrep &>/dev/null; then
+	zgrep() {
+		zcat "$2" | grep "$1"
+	}
+fi
+
+# Split "major.minor" out of the kernel release string.
+kernelVersion="$(uname -r)"
+kernelMajor="${kernelVersion%%.*}"
+kernelMinor="${kernelVersion#$kernelMajor.}"
+kernelMinor="${kernelMinor%%.*}"
+
+is_set() {
+ zgrep "CONFIG_$1=[y|m]" "$CONFIG" >/dev/null
+}
+is_set_in_kernel() {
+ zgrep "CONFIG_$1=y" "$CONFIG" >/dev/null
+}
+is_set_as_module() {
+ zgrep "CONFIG_$1=m" "$CONFIG" >/dev/null
+}
+
+# color [bold] [NAME] - emit the ANSI escape selecting the given style.
+# An unrecognized (or absent) name leaves the code list empty, so the
+# emitted "\033[m" resets all attributes (this is how "color reset" works).
+color() {
+	local codes=()
+	if [ "$1" = 'bold' ]; then
+		codes=("${codes[@]}" '1')
+		shift
+	fi
+	if [ "$#" -gt 0 ]; then
+		local code
+		case "$1" in
+			# see https://en.wikipedia.org/wiki/ANSI_escape_code#Colors
+			black) code=30 ;;
+			red) code=31 ;;
+			green) code=32 ;;
+			yellow) code=33 ;;
+			blue) code=34 ;;
+			magenta) code=35 ;;
+			cyan) code=36 ;;
+			white) code=37 ;;
+		esac
+		if [ "$code" ]; then
+			codes=("${codes[@]}" "$code")
+		fi
+	fi
+	local IFS=';'
+	echo -en '\033['"${codes[*]}"'m'
+}
+# wrap_color TEXT [bold] [NAME] - print TEXT in the given style, then
+# reset attributes and terminate the line.
+wrap_color() {
+	text="$1"
+	shift
+	color "$@"
+	echo -n "$text"
+	color reset
+	echo
+}
+
+# wrap_good LABEL VALUE - "LABEL: VALUE" with VALUE in green.
+wrap_good() {
+	echo "$(wrap_color "$1" white): $(wrap_color "$2" green)"
+}
+# wrap_bad LABEL VALUE - "LABEL: VALUE" with VALUE in bold red.
+wrap_bad() {
+	echo "$(wrap_color "$1" bold): $(wrap_color "$2" bold red)"
+}
+# wrap_warning MSG... - print a red warning on stderr.
+wrap_warning() {
+	wrap_color >&2 "$*" red
+}
+
+# check_flag NAME - report whether CONFIG_NAME is builtin, a module, or
+# missing from $CONFIG.
+check_flag() {
+	if is_set_in_kernel "$1"; then
+		wrap_good "CONFIG_$1" 'enabled'
+	elif is_set_as_module "$1"; then
+		wrap_good "CONFIG_$1" 'enabled (as module)'
+	else
+		wrap_bad "CONFIG_$1" 'missing'
+	fi
+}
+
+# check_flags NAME... - run check_flag for each argument as a bullet list.
+check_flags() {
+	for flag in "$@"; do
+		echo "- $(check_flag "$flag")"
+	done
+}
+
+# On RHEL7/CentOS7 user namespaces exist but must be switched on via a
+# kernel boot flag; warn when that flag is absent from /proc/cmdline.
+check_distro_userns() {
+	source /etc/os-release 2>/dev/null || /bin/true
+	if [[ "${ID}" =~ ^(centos|rhel)$ && "${VERSION_ID}" =~ ^7 ]]; then
+		# this is a CentOS7 or RHEL7 system
+		grep -q "user_namespace.enable=1" /proc/cmdline || {
+			# no user namespace support enabled
+			wrap_bad " (RHEL7/CentOS7" "User namespaces disabled; add 'user_namespace.enable=1' to boot command line)"
+		}
+	fi
+}
+
+# is_config PATH - true when PATH looks like a kernel config file.
+is_config() {
+	local config="$1"
+
+	# Todo: a stronger check than plain file existence could go here.
+	[[ -f "$config" ]] && return 0
+	return 1
+}
+
+# search_config [DIR] - locate a kernel config and set $CONFIG.
+# Without an argument, every path in possibleConfigs is tried; with a
+# directory argument, each name in possibleConfigFiles is tried inside
+# it. Exits the script with an error when nothing is found.
+search_config() {
+	local target_dir="$1"
+	# When empty, target_dir is re-assigned as an array of all candidate
+	# paths; bash permits this scalar/array switch.
+	[[ "$target_dir" ]] || target_dir=("${possibleConfigs[@]}")
+
+	local tryConfig
+	for tryConfig in "${target_dir[@]}"; do
+		is_config "$tryConfig" && {
+			CONFIG="$tryConfig"
+			return
+		}
+		# A directory candidate: try the usual config file names in it.
+		[[ -d "$tryConfig" ]] && {
+			for tryFile in "${possibleConfigFiles[@]}"; do
+				is_config "$tryConfig/$tryFile" && {
+					CONFIG="$tryConfig/$tryFile"
+					return
+				}
+			done
+		}
+	done
+
+	wrap_warning "error: cannot find kernel config"
+	wrap_warning " try running this script again, specifying the kernel config:"
+	wrap_warning " CONFIG=/path/to/kernel/.config $0 or $0 /path/to/kernel/.config"
+	exit 1
+}
+
+# Entry point: $1 may name a config file or a directory to search;
+# with neither, well-known locations are probed.
+CONFIG="$1"
+
+is_config "$CONFIG" || {
+	if [[ ! "$CONFIG" ]]; then
+		wrap_color "info: no config specified, searching for kernel config ..." white
+		search_config
+	elif [[ -d "$CONFIG" ]]; then
+		wrap_color "info: input is a directory, searching for kernel config in this directory..." white
+		search_config "$CONFIG"
+	else
+		wrap_warning "warning: $CONFIG seems not a kernel config, searching other paths for kernel config ..."
+		search_config
+	fi
+}
+
+wrap_color "info: reading kernel config from $CONFIG ..." white
+echo
+
+echo 'Generally Necessary:'
+
+echo -n '- '
+# Find one mounted cgroup controller and derive the common parent dir.
+cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)"
+cgroupDir="$(dirname "$cgroupSubsystemDir")"
+if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" -o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then
+	echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]"
+else
+	if [ "$cgroupSubsystemDir" ]; then
+		echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]"
+	else
+		echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')"
+	fi
+	echo " $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)"
+fi
+
+# AppArmor: only meaningful when the LSM is enabled on this host.
+if [ "$(cat /sys/module/apparmor/parameters/enabled 2>/dev/null)" = 'Y' ]; then
+	echo -n '- '
+	if command -v apparmor_parser &>/dev/null; then
+		echo "$(wrap_good 'apparmor' 'enabled and tools installed')"
+	else
+		echo "$(wrap_bad 'apparmor' 'enabled, but apparmor_parser missing')"
+		echo -n ' '
+		if command -v apt-get &>/dev/null; then
+			echo "$(wrap_color '(use "apt-get install apparmor" to fix this)')"
+		elif command -v yum &>/dev/null; then
+			echo "$(wrap_color '(your best bet is "yum install apparmor-parser")')"
+		else
+			echo "$(wrap_color '(look for an "apparmor" package for your distribution)')"
+		fi
+	fi
+fi
+
+flags=(
+	NAMESPACES {NET,PID,IPC,UTS}_NS
+	CGROUPS CGROUP_CPUACCT CGROUP_DEVICE CGROUP_FREEZER CGROUP_SCHED CPUSETS MEMCG
+	KEYS
+	MACVLAN VETH BRIDGE BRIDGE_NETFILTER
+	NF_NAT_IPV4 IP_NF_FILTER IP_NF_TARGET_MASQUERADE
+	NETFILTER_XT_MATCH_{ADDRTYPE,CONNTRACK}
+	NF_NAT NF_NAT_NEEDED
+
+	# required for bind-mounting /dev/mqueue into containers
+	POSIX_MQUEUE
+)
+check_flags "${flags[@]}"
+echo
+
+echo 'Optional Features:'
+{
+	check_flags USER_NS
+	check_distro_userns
+
+	check_flags SECCOMP
+	check_flags CGROUP_PIDS
+
+	# Swap accounting may be compiled in yet disabled by default.
+	check_flags MEMCG_SWAP MEMCG_SWAP_ENABLED
+	if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then
+		echo " $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)"
+	fi
+}
+
+# Version-dependent flags: their names changed across kernel releases.
+if [ "$kernelMajor" -lt 4 ] || [ "$kernelMajor" -eq 4 -a "$kernelMinor" -le 5 ]; then
+	check_flags MEMCG_KMEM
+fi
+
+if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 18 ]; then
+	check_flags RESOURCE_COUNTERS
+fi
+
+if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 13 ]; then
+	netprio=NETPRIO_CGROUP
+else
+	netprio=CGROUP_NET_PRIO
+fi
+
+flags=(
+	BLK_CGROUP BLK_DEV_THROTTLING IOSCHED_CFQ CFQ_GROUP_IOSCHED
+	CGROUP_PERF
+	CGROUP_HUGETLB
+	NET_CLS_CGROUP $netprio
+	CFS_BANDWIDTH FAIR_GROUP_SCHED RT_GROUP_SCHED
+)
+check_flags "${flags[@]}"
--- /dev/null
+#!/bin/bash
+# Copyright (C) 2017 SUSE LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+## --->
+# Project-specific options and functions. In *theory* you shouldn't need to
+# touch anything else in this script in order to use this elsewhere.
+project="runc"
+# Absolute path of the repository root (the parent of this script's dir).
+root="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")"
+
+# This function takes an output path as an argument, where the built
+# (preferably static) binary should be placed.
+# (The original assigned a "builddir" local that was never used; the
+# dead assignment has been removed.)
+function build_project() {
+	# Build with all tags enabled.
+	make -C "$root" COMMIT_NO= BUILDTAGS="seccomp selinux apparmor" static
+	mv "$root/$project" "$1"
+}
+
+# End of the easy-to-configure portion.
+## <---
+
+# Print usage information.
+function usage() {
+	echo "usage: release.sh [-S <gpg-key-id>] [-c <commit-ish>] [-r <release-dir>] [-v <version>]" >&2
+	exit 1
+}
+
+# Log something to stderr.
+function log() {
+	echo "[*] $*" >&2
+}
+
+# Log something to stderr and then exit with 0.
+# (Used for non-fatal bail-outs such as a missing GPG signing key.)
+function bail() {
+	log "$@"
+	exit 0
+}
+
+# Conduct a sanity-check to make sure that GPG provided with the given
+# arguments can sign something. Inability to sign things is not a fatal error.
+function gpg_cansign() {
+	gpg "$@" --clear-sign </dev/null >/dev/null
+}
+
+# When creating releases we need to build static binaries, an archive of the
+# current commit, and generate detached signatures for both.
+keyid=""
+commit="HEAD"
+version=""
+releasedir=""
+hashcmd=""
+# NOTE(review): "h:" makes -h take the hash command as an argument, so
+# there is no bare -h help flag; usage() is only reached on option errors.
+while getopts "S:c:r:v:h:" opt; do
+	case "$opt" in
+		S)
+			keyid="$OPTARG"
+			;;
+		c)
+			commit="$OPTARG"
+			;;
+		r)
+			releasedir="$OPTARG"
+			;;
+		v)
+			version="$OPTARG"
+			;;
+		h)
+			hashcmd="$OPTARG"
+			;;
+		\:)
+			echo "Missing argument: -$OPTARG" >&2
+			usage
+			;;
+		\?)
+			echo "Invalid option: -$OPTARG" >&2
+			usage
+			;;
+	esac
+done
+
+# Defaults: version from the VERSION file, per-version release dir,
+# sha256 checksums, and the host's GOARCH (amd64 if go is unavailable).
+version="${version:-$(<"$root/VERSION")}"
+releasedir="${releasedir:-release/$version}"
+hashcmd="${hashcmd:-sha256sum}"
+goarch="$(go env GOARCH || echo "amd64")"
+
+log "creating $project release in '$releasedir'"
+log " version: $version"
+log " commit: $commit"
+log " key: ${keyid:-DEFAULT}"
+log " hash: $hashcmd"
+
+# Make explicit what we're doing.
+set -x
+
+# Make the release directory.
+rm -rf "$releasedir" && mkdir -p "$releasedir"
+
+# Build project.
+build_project "$releasedir/$project.$goarch"
+
+# Generate new archive.
+git archive --format=tar --prefix="$project-$version/" "$commit" | xz > "$releasedir/$project.tar.xz"
+
+# Generate sha256 checksums for both.
+( cd "$releasedir" ; "$hashcmd" "$project".{"$goarch",tar.xz} > "$project.$hashcmd" ; )
+
+# Set up the gpgflags.
+# $gpgflags is intentionally expanded unquoted below so an empty value
+# contributes no arguments.
+[[ "$keyid" ]] && export gpgflags="--default-key $keyid"
+gpg_cansign $gpgflags || bail "Could not find suitable GPG key, skipping signing step."
+
+# Sign everything.
+gpg $gpgflags --detach-sign --armor "$releasedir/$project.$goarch"
+gpg $gpgflags --detach-sign --armor "$releasedir/$project.tar.xz"
+gpg $gpgflags --clear-sign --armor \
+	--output "$releasedir/$project.$hashcmd"{.tmp,} && \
+	mv "$releasedir/$project.$hashcmd"{.tmp,}
--- /dev/null
+#!/bin/bash
+
+# Mount a fresh tmpfs over /tmp so the command below gets a writable, private
+# scratch area, then replace this shell with the requested command.
+mount -t tmpfs none /tmp
+exec "$@"
--- /dev/null
+#!/bin/bash
+
+# Validate that all committed C sources are formatted with GNU indent.
+source "$(dirname "$BASH_SOURCE")/.validate"
+
+# Collect added/copied/modified/renamed *.c files, ignoring vendored code.
+IFS=$'\n'
+files=($(validate_diff --diff-filter=ACMR --name-only -- '*.c' | grep -v '^vendor/' || true))
+unset IFS
+
+# indent(1): "You must use the ‘-T’ option to tell indent the name of all the typenames in your program that are defined by typedef."
+INDENT="indent -linux -l120 -T size_t -T jmp_buf"
+# Only GNU indent understands the flags above; skip (do not fail) otherwise.
+if [ -z "$(indent --version 2>&1 | grep GNU)" ]; then
+ echo "Skipping C indentation checks, as GNU indent is not installed."
+ exit 0
+fi
+
+badFiles=()
+for f in "${files[@]}"; do
+ orig=$(mktemp)
+ formatted=$(mktemp)
+ # we use "git show" here to validate that what's committed is formatted
+ git show "$VALIDATE_HEAD:$f" > ${orig}
+ ${INDENT} ${orig} -o ${formatted}
+ # Any diff between the committed file and its indented form is a failure.
+ if [ "$(diff -u ${orig} ${formatted})" ]; then
+ badFiles+=("$f")
+ fi
+ rm -f ${orig} ${formatted}
+done
+
+if [ ${#badFiles[@]} -eq 0 ]; then
+ echo 'Congratulations! All C source files are properly formatted.'
+else
+ {
+ echo "These files are not properly formatted:"
+ for f in "${badFiles[@]}"; do
+ echo " - $f"
+ done
+ echo
+ echo "Please reformat the above files using \"${INDENT}\" and commit the result."
+ echo
+ } >&2
+ false
+fi
--- /dev/null
+#!/bin/bash
+
+# Validate that all committed Go sources are gofmt -s clean.
+source "$(dirname "$BASH_SOURCE")/.validate"
+
+# Collect added/copied/modified/renamed *.go files, ignoring vendored code.
+IFS=$'\n'
+files=($(validate_diff --diff-filter=ACMR --name-only -- '*.go' | grep -v '^vendor/' || true))
+unset IFS
+
+badFiles=()
+for f in "${files[@]}"; do
+ # we use "git show" here to validate that what's committed is formatted
+ # (gofmt -l prints a diff marker to stdout when input is not clean)
+ if [ "$(git show "$VALIDATE_HEAD:$f" | gofmt -s -l)" ]; then
+ badFiles+=("$f")
+ fi
+done
+
+if [ ${#badFiles[@]} -eq 0 ]; then
+ echo 'Congratulations! All Go source files are properly formatted.'
+else
+ {
+ echo "These files are not properly gofmt'd:"
+ for f in "${badFiles[@]}"; do
+ echo " - $f"
+ done
+ echo
+ echo 'Please reformat the above files using "gofmt -s -w" and commit the result.'
+ echo
+ } >&2
+ false
+fi
--- /dev/null
+// +build linux
+// +build !mips,!mipsle,!mips64,!mips64le
+
+package main
+
+import (
+ "syscall"
+
+ "golang.org/x/sys/unix"
+)
+
+// signalMap maps signal names (without the "SIG" prefix) to their numeric
+// values, so commands can accept names such as "TERM" or "KILL".
+// "CLD" and "POLL" are kept as the traditional aliases of CHLD and IO.
+// This generic Linux variant is excluded on mips (see the mips build of
+// this map), where the signal numbering differs.
+var signalMap = map[string]syscall.Signal{
+ "ABRT": unix.SIGABRT,
+ "ALRM": unix.SIGALRM,
+ "BUS": unix.SIGBUS,
+ "CHLD": unix.SIGCHLD,
+ "CLD": unix.SIGCLD,
+ "CONT": unix.SIGCONT,
+ "FPE": unix.SIGFPE,
+ "HUP": unix.SIGHUP,
+ "ILL": unix.SIGILL,
+ "INT": unix.SIGINT,
+ "IO": unix.SIGIO,
+ "IOT": unix.SIGIOT,
+ "KILL": unix.SIGKILL,
+ "PIPE": unix.SIGPIPE,
+ "POLL": unix.SIGPOLL,
+ "PROF": unix.SIGPROF,
+ "PWR": unix.SIGPWR,
+ "QUIT": unix.SIGQUIT,
+ "SEGV": unix.SIGSEGV,
+ "STKFLT": unix.SIGSTKFLT,
+ "STOP": unix.SIGSTOP,
+ "SYS": unix.SIGSYS,
+ "TERM": unix.SIGTERM,
+ "TRAP": unix.SIGTRAP,
+ "TSTP": unix.SIGTSTP,
+ "TTIN": unix.SIGTTIN,
+ "TTOU": unix.SIGTTOU,
+ "URG": unix.SIGURG,
+ "USR1": unix.SIGUSR1,
+ "USR2": unix.SIGUSR2,
+ "VTALRM": unix.SIGVTALRM,
+ "WINCH": unix.SIGWINCH,
+ "XCPU": unix.SIGXCPU,
+ "XFSZ": unix.SIGXFSZ,
+}
--- /dev/null
+// +build linux,mips linux,mipsle linux,mips64 linux,mips64le
+
+package main
+
+import (
+ "syscall"
+
+ "golang.org/x/sys/unix"
+)
+
+// signalMap is the mips variant of the name-to-signal table. It mirrors the
+// generic Linux map except that "STKFLT" is absent: SIGSTKFLT is not defined
+// on the mips architectures.
+var signalMap = map[string]syscall.Signal{
+ "ABRT": unix.SIGABRT,
+ "ALRM": unix.SIGALRM,
+ "BUS": unix.SIGBUS,
+ "CHLD": unix.SIGCHLD,
+ "CLD": unix.SIGCLD,
+ "CONT": unix.SIGCONT,
+ "FPE": unix.SIGFPE,
+ "HUP": unix.SIGHUP,
+ "ILL": unix.SIGILL,
+ "INT": unix.SIGINT,
+ "IO": unix.SIGIO,
+ "IOT": unix.SIGIOT,
+ "KILL": unix.SIGKILL,
+ "PIPE": unix.SIGPIPE,
+ "POLL": unix.SIGPOLL,
+ "PROF": unix.SIGPROF,
+ "PWR": unix.SIGPWR,
+ "QUIT": unix.SIGQUIT,
+ "SEGV": unix.SIGSEGV,
+ "STOP": unix.SIGSTOP,
+ "SYS": unix.SIGSYS,
+ "TERM": unix.SIGTERM,
+ "TRAP": unix.SIGTRAP,
+ "TSTP": unix.SIGTSTP,
+ "TTIN": unix.SIGTTIN,
+ "TTOU": unix.SIGTTOU,
+ "URG": unix.SIGURG,
+ "USR1": unix.SIGUSR1,
+ "USR2": unix.SIGUSR2,
+ "VTALRM": unix.SIGVTALRM,
+ "WINCH": unix.SIGWINCH,
+ "XCPU": unix.SIGXCPU,
+ "XFSZ": unix.SIGXFSZ,
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "os"
+ "os/signal"
+ "syscall" // only for Signal
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
+
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+// signalBufferSize is the capacity of the signal channel. A large buffer
+// reduces the chance of losing signals if the handler falls behind.
+const signalBufferSize = 2048
+
+// newSignalHandler returns a signal handler for processing SIGCHLD and SIGWINCH signals
+// while still forwarding all other signals to the process.
+// If notifySocket is present, use it to read systemd notifications from the container and
+// forward them to notifySocketHost.
+func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler {
+ if enableSubreaper {
+ // set us as the subreaper before registering the signal handler for the container
+ if err := system.SetSubreaper(1); err != nil {
+ // Best effort: failure to become subreaper is logged, not fatal.
+ logrus.Warn(err)
+ }
+ }
+ // ensure that we have a large buffer size so that we do not miss any signals
+ // in case we are not processing them fast enough.
+ s := make(chan os.Signal, signalBufferSize)
+ // handle all signals for the process.
+ signal.Notify(s)
+ return &signalHandler{
+ signals: s,
+ notifySocket: notifySocket,
+ }
+}
+
+// exit models a process exit status with the pid and
+// exit status.
+type exit struct {
+ pid int // process ID returned by wait4(2)
+ status int // decoded exit status (see utils.ExitStatus)
+}
+
+// signalHandler receives all of this process's signals and forwards or
+// handles them on behalf of the container's init process.
+type signalHandler struct {
+ signals chan os.Signal
+ notifySocket *notifySocket
+}
+
+// forward handles the main signal event loop forwarding, resizing, or reaping depending
+// on the signal received. It returns the container's exit status once pid1
+// exits, or (0, nil) immediately in the detached cases below.
+func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach bool) (int, error) {
+ // make sure we know the pid of our main process so that we can return
+ // after it dies.
+ if detach && h.notifySocket == nil {
+ // Detached with no notify socket: nothing to supervise here.
+ return 0, nil
+ }
+
+ pid1, err := process.Pid()
+ if err != nil {
+ return -1, err
+ }
+
+ if h.notifySocket != nil {
+ if detach {
+ // Detached: run the notify proxy synchronously and return.
+ h.notifySocket.run(pid1)
+ return 0, nil
+ }
+ go h.notifySocket.run(0)
+ }
+
+ // Perform the initial tty resize. Always ignore errors resizing because
+ // stdout might have disappeared (due to races with when SIGHUP is sent).
+ _ = tty.resize()
+ // Handle and forward signals.
+ for s := range h.signals {
+ switch s {
+ case unix.SIGWINCH:
+ // Ignore errors resizing, as above.
+ _ = tty.resize()
+ case unix.SIGCHLD:
+ exits, err := h.reap()
+ if err != nil {
+ logrus.Error(err)
+ }
+ for _, e := range exits {
+ logrus.WithFields(logrus.Fields{
+ "pid": e.pid,
+ "status": e.status,
+ }).Debug("process exited")
+ if e.pid == pid1 {
+ // call Wait() on the process even though we already have the exit
+ // status because we must ensure that any of the go specific process
+ // fun such as flushing pipes are complete before we return.
+ // NOTE(review): Wait's error is discarded here — presumably
+ // because the status was already collected via wait4; confirm.
+ process.Wait()
+ if h.notifySocket != nil {
+ h.notifySocket.Close()
+ }
+ return e.status, nil
+ }
+ }
+ default:
+ // Everything else is relayed verbatim to the container's init.
+ logrus.Debugf("sending signal to process %s", s)
+ if err := unix.Kill(pid1, s.(syscall.Signal)); err != nil {
+ logrus.Error(err)
+ }
+ }
+ }
+ return -1, nil
+}
+
+// reap runs wait4 in a loop until we have finished processing any existing exits
+// then returns all exits to the main event loop for further processing.
+// A nil error with a possibly-empty exits slice means there is currently
+// nothing left to reap.
+func (h *signalHandler) reap() (exits []exit, err error) {
+	var (
+		ws  unix.WaitStatus
+		rus unix.Rusage
+	)
+	for {
+		pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus)
+		if err != nil {
+			// wait4 can be interrupted by a signal (EINTR) — common under
+			// Go's asynchronous preemption; retry rather than aborting the
+			// whole reap pass.
+			if err == unix.EINTR {
+				continue
+			}
+			// ECHILD simply means there are no children left to wait for.
+			if err == unix.ECHILD {
+				return exits, nil
+			}
+			return nil, err
+		}
+		// With WNOHANG, pid 0 means children exist but none have exited yet.
+		if pid <= 0 {
+			return exits, nil
+		}
+		exits = append(exits, exit{
+			pid:    pid,
+			status: utils.ExitStatus(ws),
+		})
+	}
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/specconv"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/urfave/cli"
+)
+
+// specCommand implements "runc spec": it writes an example (starter)
+// config.json into the bundle directory, optionally adapted for rootless use.
+var specCommand = cli.Command{
+ Name: "spec",
+ Usage: "create a new specification file",
+ ArgsUsage: "",
+ Description: `The spec command creates the new specification file named "` + specConfig + `" for
+the bundle.
+
+The spec generated is just a starter file. Editing of the spec is required to
+achieve desired results. For example, the newly generated spec includes an args
+parameter that is initially set to call the "sh" command when the container is
+started. Calling "sh" may work for an ubuntu container or busybox, but will not
+work for containers that do not include the "sh" program.
+
+EXAMPLE:
+  To run docker's hello-world container one needs to set the args parameter
+in the spec to call hello. This can be done using the sed command or a text
+editor. The following commands create a bundle for hello-world, change the
+default args parameter in the spec from "sh" to "/hello", then run the hello
+command in a new hello-world container named container1:
+
+    mkdir hello
+    cd hello
+    docker pull hello-world
+    docker export $(docker create hello-world) > hello-world.tar
+    mkdir rootfs
+    tar -C rootfs -xf hello-world.tar
+    runc spec
+    sed -i 's;"sh";"/hello";' ` + specConfig + `
+    runc run container1
+
+In the run command above, "container1" is the name for the instance of the
+container that you are starting. The name you provide for the container instance
+must be unique on your host.
+
+An alternative for generating a customized spec config is to use "oci-runtime-tool", the
+sub-command "oci-runtime-tool generate" has lots of options that can be used to do any
+customizations as you want, see runtime-tools (https://github.com/opencontainers/runtime-tools)
+to get more information.
+
+When starting a container through runc, runc needs root privilege. If not
+already running as root, you can use sudo to give runc root privilege. For
+example: "sudo runc start container1" will give runc root privilege to start the
+container on your host.
+
+Alternatively, you can start a rootless container, which has the ability to run
+without root privileges. For this to work, the specification file needs to be
+adjusted accordingly. You can pass the parameter --rootless to this command to
+generate a proper rootless spec file.
+
+Note that --rootless is not needed when you execute runc as the root in a user namespace
+created by an unprivileged user.
+`,
+ Flags: []cli.Flag{
+ cli.StringFlag{
+ Name: "bundle, b",
+ Value: "",
+ Usage: "path to the root of the bundle directory",
+ },
+ cli.BoolFlag{
+ Name: "rootless",
+ Usage: "generate a configuration for a rootless container",
+ },
+ },
+ Action: func(context *cli.Context) error {
+ if err := checkArgs(context, 0, exactArgs); err != nil {
+ return err
+ }
+ spec := specconv.Example()
+
+ rootless := context.Bool("rootless")
+ if rootless {
+ specconv.ToRootless(spec)
+ }
+
+ // checkNoFile refuses to clobber an existing file at the given path.
+ checkNoFile := func(name string) error {
+ _, err := os.Stat(name)
+ if err == nil {
+ return fmt.Errorf("File %s exists. Remove it first", name)
+ }
+ if !os.IsNotExist(err) {
+ return err
+ }
+ return nil
+ }
+ // Write into the bundle directory if one was given; otherwise the CWD.
+ bundle := context.String("bundle")
+ if bundle != "" {
+ if err := os.Chdir(bundle); err != nil {
+ return err
+ }
+ }
+ if err := checkNoFile(specConfig); err != nil {
+ return err
+ }
+ data, err := json.MarshalIndent(spec, "", "\t")
+ if err != nil {
+ return err
+ }
+ // 0666 is pre-umask; the process umask trims the on-disk permissions.
+ return ioutil.WriteFile(specConfig, data, 0666)
+ },
+}
+
+// loadSpec loads the specification from the provided path.
+// It returns a descriptive error when the file is missing, and validates the
+// embedded process spec before handing the result back.
+func loadSpec(cPath string) (spec *specs.Spec, err error) {
+	f, err := os.Open(cPath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, fmt.Errorf("JSON specification file %s not found", cPath)
+		}
+		return nil, err
+	}
+	defer f.Close()
+
+	err = json.NewDecoder(f).Decode(&spec)
+	if err != nil {
+		return nil, err
+	}
+	return spec, validateProcessSpec(spec.Process)
+}
+
+// createLibContainerRlimit converts an OCI POSIXRlimit into libcontainer's
+// internal Rlimit representation, translating the textual type name.
+func createLibContainerRlimit(rlimit specs.POSIXRlimit) (configs.Rlimit, error) {
+	kind, err := strToRlimit(rlimit.Type)
+	if err != nil {
+		return configs.Rlimit{}, err
+	}
+	res := configs.Rlimit{
+		Type: kind,
+		Hard: rlimit.Hard,
+		Soft: rlimit.Soft,
+	}
+	return res, nil
+}
--- /dev/null
+package main
+
+import (
+ "errors"
+ "fmt"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/urfave/cli"
+)
+
+// startCommand implements "runc start": it executes the user-defined process
+// in a container previously set up with "runc create".
+var startCommand = cli.Command{
+	Name:  "start",
+	Usage: "executes the user defined process in a created container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+	Description: `The start command executes the user defined process in a created container.`,
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		status, err := container.Status()
+		if err != nil {
+			return err
+		}
+		// Only a freshly created container can be started.
+		switch status {
+		case libcontainer.Created:
+			return container.Exec()
+		case libcontainer.Stopped:
+			return errors.New("cannot start a container that has stopped")
+		case libcontainer.Running:
+			return errors.New("cannot start an already running container")
+		default:
+			// Error strings must not end in a newline (go vet flags it);
+			// the CLI layer handles formatting when printing the error.
+			return fmt.Errorf("cannot start a container in the %s state", status)
+		}
+	},
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "os"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/urfave/cli"
+)
+
+// stateCommand implements "runc state": it prints the container's current
+// state as indented JSON on stdout.
+var stateCommand = cli.Command{
+	Name:  "state",
+	Usage: "output the state of a container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container.`,
+	Description: `The state command outputs current state information for the
+instance of a container.`,
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		status, err := container.Status()
+		if err != nil {
+			return err
+		}
+		state, err := container.State()
+		if err != nil {
+			return err
+		}
+		// A stopped container has no live init process; report pid 0.
+		pid := state.BaseState.InitProcessPid
+		if status == libcontainer.Stopped {
+			pid = 0
+		}
+		bundle, annotations := utils.Annotations(state.Config.Labels)
+		data, err := json.MarshalIndent(containerState{
+			Version:        state.BaseState.Config.Version,
+			ID:             state.BaseState.ID,
+			InitProcessPid: pid,
+			Status:         status.String(),
+			Bundle:         bundle,
+			Rootfs:         state.BaseState.Config.Rootfs,
+			Created:        state.BaseState.Created,
+			Annotations:    annotations,
+		}, "", "  ")
+		if err != nil {
+			return err
+		}
+		os.Stdout.Write(data)
+		return nil
+	},
+}
--- /dev/null
+# runc Integration Tests
+
+Integration tests provide end-to-end testing of runc.
+
+Note that integration tests do **not** replace unit tests.
+
+As a rule of thumb, code should be tested thoroughly with unit tests.
+Integration tests on the other hand are meant to test a specific feature end
+to end.
+
+Integration tests are written in *bash* using the
+[bats](https://github.com/sstephenson/bats) framework.
+
+## Running integration tests
+
+The easiest way to run integration tests is with Docker:
+```
+$ make integration
+```
+Alternatively, you can run integration tests directly on your host through make:
+```
+$ sudo make localintegration
+```
+Or you can just run them directly using bats:
+```
+$ sudo bats tests/integration
+```
+To run a single test bucket:
+```
+$ make integration TESTPATH="/checkpoint.bats"
+```
+
+
+To run them on your host, you will need to setup a development environment plus
+[bats](https://github.com/sstephenson/bats#installing-bats-from-source)
+For example:
+```
+$ cd ~/go/src/github.com
+$ git clone https://github.com/sstephenson/bats.git
+$ cd bats
+$ ./install.sh /usr/local
+```
+
+> **Note**: There are known issues running the integration tests using
+> **devicemapper** as a storage driver, make sure that your docker daemon
+> is using **aufs** if you want to successfully run the integration tests.
+
+## Writing integration tests
+
+[helper functions](https://github.com/opencontainers/runc/blob/master/tests/integration/helpers.bash)
+are provided in order to facilitate writing tests.
+
+```sh
+#!/usr/bin/env bats
+
+# This will load the helpers.
+load helpers
+
+# setup is called at the beginning of every test.
+function setup() {
+ # see functions teardown_hello and setup_hello in helpers.bash, used to
+ # create a pristine environment for running your tests
+ teardown_hello
+ setup_hello
+}
+
+# teardown is called at the end of every test.
+function teardown() {
+ teardown_hello
+}
+
+@test "this is a simple test" {
+ runc run containerid
+ # "The runc macro" automatically populates $status, $output and $lines.
+ # Please refer to bats documentation to find out more.
+ [ "$status" -eq 0 ]
+
+ # check expected output
+ [[ "${output}" == *"Hello"* ]]
+}
+
+```
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Remove any state left behind by a previous (possibly failed) run.
+function teardown() {
+ rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json
+ teardown_running_container test_cgroups_kmem
+ teardown_running_container test_cgroups_permissions
+ teardown_busybox
+}
+
+# Start each test from a clean busybox bundle.
+function setup() {
+ teardown
+ setup_busybox
+}
+
+# Assert that cgroup file $2 under cgroup directory $1 contains the numeric
+# value $3. Prints the path and both values for easier debugging on failure.
+function check_cgroup_value() {
+	cgroup=$1
+	source=$2
+	expected=$3
+
+	# Quote the path expansions so paths containing spaces or glob
+	# characters cannot be word-split (shellcheck SC2086).
+	current=$(cat "$cgroup/$source")
+	echo "$cgroup/$source"
+	echo "current" "$current" "!?" "$expected"
+	[ "$current" -eq "$expected" ]
+}
+
+@test "runc update --kernel-memory (initialized)" {
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+ requires cgroups_kmem
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+
+ # Set some initial known values
+ DATA=$(cat <<-EOF
+ "memory": {
+ "kernel": 16777216
+ },
+EOF
+ )
+ # Collapse the heredoc onto one line so it can be spliced into the JSON
+ # by the sed below.
+ DATA=$(echo ${DATA} | sed 's/\n/\\n/g')
+ sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json
+
+ # run a detached busybox to work with
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem
+ [ "$status" -eq 0 ]
+
+ # update kernel memory limit
+ runc update test_cgroups_kmem --kernel-memory 50331648
+ [ "$status" -eq 0 ]
+
+ # check the value
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+}
+
+@test "runc update --kernel-memory (uninitialized)" {
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+ requires cgroups_kmem
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+
+ # run a detached busybox to work with
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem
+ [ "$status" -eq 0 ]
+
+ # update kernel memory limit
+ runc update test_cgroups_kmem --kernel-memory 50331648
+ # Since kernel 4.6, we can update kernel memory without initialization
+ # because it's accounted by default.
+ if [ "$KERNEL_MAJOR" -lt 4 ] || [ "$KERNEL_MAJOR" -eq 4 -a "$KERNEL_MINOR" -le 5 ]; then
+ [ ! "$status" -eq 0 ]
+ else
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+ fi
+}
+
+# Without limits or an explicit cgroup path, no cgroup write is needed, so
+# this must succeed even without cgroup permissions.
+@test "runc create (no limits + no cgrouppath + no permission) succeeds" {
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+ [ "$status" -eq 0 ]
+}
+
+@test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" {
+ requires rootless
+ requires rootless_no_cgroup
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+ [ "$status" -eq 1 ]
+ [[ ${lines[1]} == *"permission denied"* ]]
+}
+
+@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" {
+ requires rootless
+ requires rootless_no_cgroup
+
+ set_resources_limit "$BUSYBOX_BUNDLE"
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+ [ "$status" -eq 1 ]
+ [[ ${lines[1]} == *"cannot set pids limit: container could not join or create cgroup"* ]]
+}
+
+@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" {
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+ set_resources_limit "$BUSYBOX_BUNDLE"
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+ [ "$status" -eq 0 ]
+}
+
+@test "runc exec (limits + cgrouppath + permission on the cgroup dir) succeeds" {
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+ set_resources_limit "$BUSYBOX_BUNDLE"
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
+ [ "$status" -eq 0 ]
+
+ runc exec test_cgroups_permissions echo "cgroups_exec"
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} == *"cgroups_exec"* ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Start every test from a pristine busybox bundle.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "checkpoint and restore" {
+ # XXX: currently criu require root containers.
+ requires criu root
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # Checkpoint/restore twice to make sure a restored container can be
+ # checkpointed again.
+ for i in `seq 2`; do
+ # checkpoint the running container
+ runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox
+ ret=$?
+ # NOTE(review): the runc helper wraps bats' "run", so $? here is the
+ # helper's return, not runc's exit code — $status is the conventional
+ # check; confirm against helpers.bash.
+ # if you are having problems getting criu to work uncomment the following dump:
+ #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log
+ cat ./work-dir/dump.log | grep -B 5 Error || true
+ [ "$ret" -eq 0 ]
+
+ # after checkpoint busybox is no longer running
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ # restore from checkpoint
+ runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox
+ ret=$?
+ cat ./work-dir/restore.log | grep -B 5 Error || true
+ [ "$ret" -eq 0 ]
+
+ # busybox should be back up and running
+ testcontainer test_busybox running
+ done
+}
+
+@test "checkpoint --pre-dump and restore" {
+ # XXX: currently criu require root containers.
+ requires criu root
+
+ # The changes to 'terminal' are needed for running in detached mode
+ sed -i 's;"terminal": true;"terminal": false;' config.json
+ # Replace "sh" with a small echo loop so we can verify stdin/stdout
+ # survive the checkpoint/restore cycle.
+ sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json
+
+ # The following code creates pipes for stdin and stdout.
+ # CRIU can't handle fifo-s, so we need all these tricks.
+ fifo=`mktemp -u /tmp/runc-fifo-XXXXXX`
+ mkfifo $fifo
+
+ # stdout
+ cat $fifo | cat $fifo &
+ pid=$!
+ exec 50</proc/$pid/fd/0
+ exec 51>/proc/$pid/fd/0
+
+ # stdin
+ cat $fifo | cat $fifo &
+ pid=$!
+ exec 60</proc/$pid/fd/0
+ exec 61>/proc/$pid/fd/0
+
+ # Unblock both cat pipelines, then remove the fifo from the filesystem
+ # (the open fds keep the pipes alive).
+ echo -n > $fifo
+ unlink $fifo
+
+ # run busybox
+ __runc run -d test_busybox <&60 >&51 2>&51
+ [ $? -eq 0 ]
+
+ testcontainer test_busybox running
+
+ #test checkpoint pre-dump
+ mkdir parent-dir
+ runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox
+ [ "$status" -eq 0 ]
+
+ # busybox should still be running
+ runc state test_busybox
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"running"* ]]
+
+ # checkpoint the running container
+ mkdir image-dir
+ mkdir work-dir
+ runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox
+ cat ./work-dir/dump.log | grep -B 5 Error || true
+ [ "$status" -eq 0 ]
+
+ # after checkpoint busybox is no longer running
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ # restore from checkpoint
+ __runc --criu "$CRIU" restore -d --work-path ./work-dir --image-path ./image-dir test_busybox <&60 >&51 2>&51
+ ret=$?
+ cat ./work-dir/restore.log | grep -B 5 Error || true
+ [ $ret -eq 0 ]
+
+ # busybox should be back up and running
+ testcontainer test_busybox running
+
+ runc exec --cwd /bin test_busybox echo ok
+ [ "$status" -eq 0 ]
+ [[ ${output} == "ok" ]]
+
+ # Exercise the restored stdio: send input and expect the echo loop reply.
+ echo Ping >&61
+ exec 61>&-
+ exec 51>&-
+ run cat <&50
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"ponG Ping"* ]]
+}
+
+@test "checkpoint --lazy-pages and restore" {
+ # XXX: currently criu require root containers.
+ requires criu root
+
+ # check if lazy-pages is supported
+ run ${CRIU} check --feature uffd-noncoop
+ if [ "$status" -eq 1 ]; then
+ # this criu does not support lazy migration; skip the test
+ skip "this criu does not support lazy migration"
+ fi
+
+ # The changes to 'terminal' are needed for running in detached mode
+ sed -i 's;"terminal": true;"terminal": false;' config.json
+ # This should not be necessary: https://github.com/checkpoint-restore/criu/issues/575
+ sed -i 's;"readonly": true;"readonly": false;' config.json
+ sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json
+
+ # The following code creates pipes for stdin and stdout.
+ # CRIU can't handle fifo-s, so we need all these tricks.
+ fifo=`mktemp -u /tmp/runc-fifo-XXXXXX`
+ mkfifo $fifo
+
+ # For lazy migration we need to know when CRIU is ready to serve
+ # the memory pages via TCP.
+ lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX`
+ mkfifo $lazy_pipe
+
+ # TCP port for lazy migration
+ port=27277
+
+ # stdout
+ cat $fifo | cat $fifo &
+ pid=$!
+ exec 50</proc/$pid/fd/0
+ exec 51>/proc/$pid/fd/0
+
+ # stdin
+ cat $fifo | cat $fifo &
+ pid=$!
+ exec 60</proc/$pid/fd/0
+ exec 61>/proc/$pid/fd/0
+
+ echo -n > $fifo
+ unlink $fifo
+
+ # run busybox
+ __runc run -d test_busybox <&60 >&51 2>&51
+ [ $? -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # checkpoint the running container
+ mkdir image-dir
+ mkdir work-dir
+ # Double fork taken from helpers.bats
+ # We need to start 'runc checkpoint --lazy-pages' in the background,
+ # so we double fork in the shell.
+ (runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) &
+ # Sleeping here. This is ugly, but not sure how else to handle it.
+ # The return code of the in the background running runc is needed, if
+ # there is some basic error. If the lazy migration is ready can
+ # be handled by $lazy_pipe. Which probably will always be ready
+ # after sleeping two seconds.
+ sleep 2
+ # Check if inventory.img was written
+ [ -e image-dir/inventory.img ]
+ # If the inventory.img exists criu checkpointed some things, let's see
+ # if there were other errors in the log file.
+ run grep -B 5 Error ./work-dir/dump.log -q
+ [ "$status" -eq 1 ]
+
+ # This will block until CRIU is ready to serve memory pages
+ cat $lazy_pipe
+ # NOTE(review): $status here is still the result of the previous
+ # "run grep" — the plain cat above does not set it; confirm whether this
+ # second check is intentional or should inspect cat's own exit code.
+ [ "$status" -eq 1 ]
+
+ unlink $lazy_pipe
+
+ # Double fork taken from helpers.bats
+ # We need to start 'criu lazy-pages' in the background,
+ # so we double fork in the shell.
+ # Start CRIU in lazy-daemon mode
+ $(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) &
+
+ # Restore lazily from checkpoint.
+ # The restored container needs a different name as the checkpointed
+ # container is not yet destroyed. It is only destroyed at that point
+ # in time when the last page is lazily transferred to the destination.
+ # Killing the CRIU on the checkpoint side will let the container
+ # continue to run if the migration failed at some point.
+ # NOTE(review): restore uses --work-path ./image-dir while the grep
+ # below inspects ./work-dir/dump.log (from the checkpoint step) —
+ # confirm whether the restore log should be checked instead.
+ __runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51
+ ret=$?
+ [ $ret -eq 0 ]
+ run grep -B 5 Error ./work-dir/dump.log -q
+ [ "$status" -eq 1 ]
+
+ # busybox should be back up and running
+ testcontainer test_busybox_restore running
+
+ runc exec --cwd /bin test_busybox_restore echo ok
+ [ "$status" -eq 0 ]
+ [[ ${output} == "ok" ]]
+
+ echo Ping >&61
+ exec 61>&-
+ exec 51>&-
+ run cat <&50
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"ponG Ping"* ]]
+}
+
+@test "checkpoint and restore in external network namespace" {
+ # XXX: currently criu require root containers.
+ requires criu root
+
+ # check if external_net_ns is supported; only with criu 3.10++
+ run ${CRIU} check --feature external_net_ns
+ if [ "$status" -eq 1 ]; then
+ # this criu does not support external_net_ns; skip the test
+ skip "this criu does not support external network namespaces"
+ fi
+
+ # create a temporary name for the test network namespace
+ tmp=`mktemp`
+ rm -f $tmp
+ ns_name=`basename $tmp`
+ # create network namespace
+ ip netns add $ns_name
+ # Deliberate trick: adding the same namespace a second time fails, and the
+ # error message contains the namespace's filesystem path, which we parse out.
+ ns_path=`ip netns add $ns_name 2>&1 | sed -e 's/.*"\(.*\)".*/\1/'`
+
+ # Record the namespace's inode so we can verify the restored container
+ # ends up in the very same namespace.
+ ns_inode=`ls -iL $ns_path | awk '{ print $1 }'`
+
+ # tell runc which network namespace to use
+ sed -i "s;\"type\": \"network\";\"type\": \"network\",\"path\": \"$ns_path\";" config.json
+
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ for i in `seq 2`; do
+ # checkpoint the running container; this automatically tells CRIU to
+ # handle the network namespace defined in config.json as an external
+ runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox
+ ret=$?
+ # if you are having problems getting criu to work uncomment the following dump:
+ #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log
+ cat ./work-dir/dump.log | grep -B 5 Error || true
+ [ "$ret" -eq 0 ]
+
+ # after checkpoint busybox is no longer running
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ # restore from checkpoint; this should restore the container into the existing network namespace
+ runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox
+ ret=$?
+ cat ./work-dir/restore.log | grep -B 5 Error || true
+ [ "$ret" -eq 0 ]
+
+ # busybox should be back up and running
+ testcontainer test_busybox running
+
+ # container should be running in same network namespace as before
+ pid=`__runc state test_busybox | jq '.pid'`
+ ns_inode_new=`readlink /proc/$pid/ns/net | sed -e 's/.*\[\(.*\)\]/\1/'`
+ echo "old network namespace inode $ns_inode"
+ echo "new network namespace inode $ns_inode_new"
+ [ "$ns_inode" -eq "$ns_inode_new" ]
+ done
+ ip netns del $ns_name
+}
+
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Start every test from a pristine busybox bundle.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+# Basic create/start lifecycle: create leaves the container in "created",
+# start transitions it to "running".
+@test "runc create" {
+ runc create --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox created
+
+ # start the command
+ runc start test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+}
+
+# exec into a created (not yet started) container must work and must not
+# change the container's state.
+@test "runc create exec" {
+ runc create --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox created
+
+ runc exec test_busybox true
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox created
+
+ # start the command
+ runc start test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+}
+
+# --pid-file must write the init process pid, matching what "runc state"
+# reports.
+@test "runc create --pid-file" {
+ runc create --pid-file pid.txt --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox created
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+
+ # start the command
+ runc start test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+}
+
+# Same as the --pid-file test, but run from a CWD other than the bundle dir
+# to verify the relative pid-file path is resolved against the CWD.
+@test "runc create --pid-file with new CWD" {
+	# create pid_file directory as the CWD.
+	# Note: do NOT wrap mkdir/cd in bats' "run" — run executes its command
+	# inside a command substitution (a subshell), so a "run cd" would never
+	# change this shell's working directory and the test would silently run
+	# in the old CWD. A failing mkdir/cd fails the test directly.
+	mkdir pid_file
+	cd pid_file
+
+	runc create --pid-file pid.txt -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox
+	[ "$status" -eq 0 ]
+
+	testcontainer test_busybox created
+
+	# check pid.txt was generated
+	[ -e pid.txt ]
+
+	run cat pid.txt
+	[ "$status" -eq 0 ]
+	[[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+
+	# start the command
+	runc start test_busybox
+	[ "$status" -eq 0 ]
+
+	testcontainer test_busybox running
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for the global --debug flag, alone and combined with
+# --log / --log-format: debug messages must go to the log target (file),
+# never to the command's stdout/stderr capture.
+
+load helpers
+
+function setup() {
+ teardown_hello
+ setup_hello
+}
+
+function teardown() {
+ teardown_hello
+}
+
+@test "global --debug" {
+ # run hello-world
+ runc --debug run test_hello
+ echo "${output}"
+ [ "$status" -eq 0 ]
+}
+
+@test "global --debug to --log" {
+ # run hello-world
+ runc --log log.out --debug run test_hello
+ [ "$status" -eq 0 ]
+
+ # check output does not include debug info
+ [[ "${output}" != *"level=debug"* ]]
+
+ # check log.out was generated
+ [ -e log.out ]
+
+ # check expected debug output was sent to log.out
+ run cat log.out
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"level=debug"* ]]
+}
+
+@test "global --debug to --log --log-format 'text'" {
+ # run hello-world
+ runc --log log.out --log-format "text" --debug run test_hello
+ [ "$status" -eq 0 ]
+
+ # check output does not include debug info
+ [[ "${output}" != *"level=debug"* ]]
+
+ # check log.out was generated
+ [ -e log.out ]
+
+ # check expected debug output was sent to log.out
+ run cat log.out
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"level=debug"* ]]
+}
+
+@test "global --debug to --log --log-format 'json'" {
+ # run hello-world
+ runc --log log.out --log-format "json" --debug run test_hello
+ [ "$status" -eq 0 ]
+
+ # check output does not include debug info
+ [[ "${output}" != *"level=debug"* ]]
+
+ # check log.out was generated
+ [ -e log.out ]
+
+ # check expected debug output was sent to log.out (JSON-encoded level key)
+ run cat log.out
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *'"level":"debug"'* ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc delete": deleting a stopped container,
+# force-deleting a running one, and deleting a nonexistent container.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc delete" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+ # wait for busybox to be in the destroyed state
+ retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+ # delete test_busybox
+ runc delete test_busybox
+ [ "$status" -eq 0 ]
+
+ # the container must no longer be known to runc
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+}
+
+@test "runc delete --force" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ # force delete test_busybox; must succeed even while the container runs
+ runc delete --force test_busybox
+ [ "$status" -eq 0 ]
+
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+}
+
+@test "runc delete --force ignore not exist" {
+ runc delete --force notexists
+ [ "$status" -eq 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc events": one-shot --stats output and the
+# periodic event stream with default, 1s, and 100ms --interval values.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "events --stats" {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # generate stats
+ runc events --stats test_busybox
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]]
+ [[ "${lines[0]}" == *"data"* ]]
+}
+
+@test "events --interval default " {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # spawn two sub processes (shells)
+ # the first sub process is an event logger that sends stats events to events.log
+ # the second sub process waits for an event that includes test_busybox then
+ # kills the test_busybox container which causes the event logger to exit
+ (__runc events test_busybox > events.log) &
+ (
+ retry 10 1 eval "grep -q 'test_busybox' events.log"
+ teardown_running_container test_busybox
+ ) &
+ wait # wait for the above sub shells to finish
+
+ [ -e events.log ]
+
+ run cat events.log
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]]
+ [[ "${lines[0]}" == *"data"* ]]
+}
+
+@test "events --interval 1s " {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # spawn two sub processes (shells)
+ # the first sub process is an event logger that sends stats events to events.log once a second
+ # the second sub process tries 3 times for an event that includes test_busybox
+ # pausing 1s between each attempt then kills the test_busybox container which
+ # causes the event logger to exit
+ (__runc events --interval 1s test_busybox > events.log) &
+ (
+ retry 3 1 eval "grep -q 'test_busybox' events.log"
+ teardown_running_container test_busybox
+ ) &
+ wait # wait for the above sub shells to finish
+
+ [ -e events.log ]
+
+ run eval "grep -q 'test_busybox' events.log"
+ [ "$status" -eq 0 ]
+}
+
+@test "events --interval 100ms " {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ #prove there is no carry over of events.log from a prior test
+ [ ! -e events.log ]
+
+ # spawn two sub processes (shells)
+ # the first sub process is an event logger that sends stats events to events.log once every 100ms
+ # the second sub process tries 3 times for an event that includes test_busybox
+ # pausing 100ms between each attempt then kills the test_busybox container which
+ # causes the event logger to exit
+ (__runc events --interval 100ms test_busybox > events.log) &
+ (
+ retry 3 0.100 eval "grep -q 'test_busybox' events.log"
+ teardown_running_container test_busybox
+ ) &
+ wait # wait for the above sub shells to finish
+
+ [ -e events.log ]
+
+ run eval "grep -q 'test_busybox' events.log"
+ [ "$status" -eq 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc exec": basic exec, --pid-file, --cwd, --env,
+# --user, and --additional-gids against a detached busybox container.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc exec" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec test_busybox echo Hello from exec
+ [ "$status" -eq 0 ]
+ echo text echoed = "'""${output}""'"
+ [[ "${output}" == *"Hello from exec"* ]]
+}
+
+@test "runc exec --pid-file" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec --pid-file pid.txt test_busybox echo Hello from exec
+ [ "$status" -eq 0 ]
+ echo text echoed = "'""${output}""'"
+ [[ "${output}" == *"Hello from exec"* ]]
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ # the exec'd process gets its own pid, distinct from the container init
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ [0-9]+ ]]
+ [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]]
+}
+
+@test "runc exec --pid-file with new CWD" {
+ # create pid_file directory as the CWD
+ run mkdir pid_file
+ [ "$status" -eq 0 ]
+ # NOTE(review): bats' "run" captures output via a subshell, so this "cd"
+ # may not persist into the test shell — confirm pid.txt actually lands in
+ # pid_file/ rather than the bundle directory.
+ run cd pid_file
+ [ "$status" -eq 0 ]
+
+ # run busybox detached
+ runc run -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec --pid-file pid.txt test_busybox echo Hello from exec
+ [ "$status" -eq 0 ]
+ echo text echoed = "'""${output}""'"
+ [[ "${output}" == *"Hello from exec"* ]]
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ [0-9]+ ]]
+ [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]]
+}
+
+@test "runc exec ls -la" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec test_busybox ls -la
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} == *"total"* ]]
+ [[ ${lines[1]} == *"."* ]]
+ [[ ${lines[2]} == *".."* ]]
+}
+
+@test "runc exec ls -la with --cwd" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec --cwd /bin test_busybox pwd
+ [ "$status" -eq 0 ]
+ [[ ${output} == "/bin"* ]]
+}
+
+@test "runc exec --env" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec --env RUNC_EXEC_TEST=true test_busybox env
+ [ "$status" -eq 0 ]
+
+ [[ ${output} == *"RUNC_EXEC_TEST=true"* ]]
+}
+
+@test "runc exec --user" {
+ # --user can't work in rootless containers that don't have idmap.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc exec --user 1000:1000 test_busybox id
+ [ "$status" -eq 0 ]
+
+ [[ "${output}" == "uid=1000 gid=1000"* ]]
+}
+
+@test "runc exec --additional-gids" {
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ wait_for_container 15 1 test_busybox
+
+ runc exec --user 1000:1000 --additional-gids 100 --additional-gids 99 test_busybox id
+ [ "$status" -eq 0 ]
+
+ [[ ${output} == "uid=1000 gid=1000 groups=99(nogroup),100(users)" ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for runc's help output: the global -h/--help flags,
+# per-subcommand -h pages, and the error path for an unknown subcommand.
+
+load helpers
+
+@test "runc -h" {
+ runc -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ NAME:+ ]]
+ [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]]
+
+ runc --help
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ NAME:+ ]]
+ [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]]
+}
+
+@test "runc command -h" {
+ runc checkpoint -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ checkpoint+ ]]
+
+ runc delete -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ delete+ ]]
+
+ runc events -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ events+ ]]
+
+ runc exec -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ exec+ ]]
+
+ runc kill -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ kill+ ]]
+
+ runc list -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ NAME:+ ]]
+ [[ ${lines[1]} =~ runc\ list+ ]]
+
+ runc list --help
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ NAME:+ ]]
+ [[ ${lines[1]} =~ runc\ list+ ]]
+
+ runc pause -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ pause+ ]]
+
+ runc restore -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ restore+ ]]
+
+ runc resume -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ resume+ ]]
+
+ # We don't use runc_spec here, because we're just testing the help page.
+ runc spec -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ spec+ ]]
+
+ runc start -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ start+ ]]
+
+ runc run -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ run+ ]]
+
+ runc state -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ state+ ]]
+
+ runc update -h
+ [ "$status" -eq 0 ]
+ [[ ${lines[1]} =~ runc\ update+ ]]
+
+}
+
+@test "runc foo -h" {
+ runc foo -h
+ [ "$status" -ne 0 ]
+ [[ "${output}" == *"No help topic for 'foo'"* ]]
+}
--- /dev/null
+#!/bin/bash
+
+# Shared environment and helper functions for the runc integration tests.
+# Sourced by every *.bats file via "load helpers".
+
+# Root directory of integration tests.
+INTEGRATION_ROOT=$(dirname "$(readlink -f "$BASH_SOURCE")")
+
+. ${INTEGRATION_ROOT}/multi-arch.bash
+
+# Paths to the binaries under test, relative to this file.
+RUNC="${INTEGRATION_ROOT}/../../runc"
+RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty"
+# Throwaway GOPATH used by the spec-validator test.
+GOPATH="$(mktemp -d --tmpdir runc-integration-gopath.XXXXXX)"
+
+# Test data path.
+TESTDATA="${INTEGRATION_ROOT}/testdata"
+
+# Busybox image
+BUSYBOX_IMAGE="$BATS_TMPDIR/busybox.tar"
+BUSYBOX_BUNDLE="$BATS_TMPDIR/busyboxtest"
+
+# hello-world in tar format
+HELLO_FILE=`get_hello`
+HELLO_IMAGE="$TESTDATA/$HELLO_FILE"
+HELLO_BUNDLE="$BATS_TMPDIR/hello-world"
+
+# CRIU PATH
+CRIU="$(which criu || true)"
+
+# Kernel version, split into major/minor for feature checks.
+KERNEL_VERSION="$(uname -r)"
+KERNEL_MAJOR="${KERNEL_VERSION%%.*}"
+KERNEL_MINOR="${KERNEL_VERSION#$KERNEL_MAJOR.}"
+KERNEL_MINOR="${KERNEL_MINOR%%.*}"
+
+# Root state path.
+ROOT=$(mktemp -d "$BATS_TMPDIR/runc.XXXXXX")
+
+# Path to console socket.
+CONSOLE_SOCKET="$BATS_TMPDIR/console.sock"
+
+# Cgroup paths
+CGROUP_MEMORY_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<MEMORY\>/ { print $5; exit }')
+CGROUP_CPU_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<CPU\>/ { print $5; exit }')
+CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup"
+CGROUP_MEMORY="${CGROUP_MEMORY_BASE_PATH}${CGROUPS_PATH}"
+
+# CONFIG_MEMCG_KMEM support
+KMEM="${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes"
+RT_PERIOD="${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us"
+
+# Check if we're in rootless mode.
+# Non-zero uid (non-root) means the rootless code paths are exercised.
+ROOTLESS=$(id -u)
+
+# Wrapper for runc.
+# Runs runc under bats' "run", so callers can inspect $status/$output,
+# and echoes both to stderr for easier debugging of failed tests.
+function runc() {
+ run __runc "$@"
+
+ # Some debug information to make life easier. bats will only print it if the
+ # test failed, in which case the output is useful.
+ echo "runc $@ (status=$status):" >&2
+ echo "$output" >&2
+}
+
+# Raw wrapper for runc.
+# Invokes the binary directly (no bats capture), logging to stderr and
+# using the per-run state directory $ROOT.
+function __runc() {
+ "$RUNC" --log /proc/self/fd/2 --root "$ROOT" "$@"
+}
+
+# Wrapper for runc spec, which takes only one argument (the bundle path).
+# In rootless mode it also patches the generated config with id mappings
+# and/or empty cgroup resources, depending on $ROOTLESS_FEATURES.
+function runc_spec() {
+ ! [[ "$#" > 1 ]]
+
+ local args=()
+ local bundle=""
+
+ if [ "$ROOTLESS" -ne 0 ]; then
+ args+=("--rootless")
+ fi
+ if [ "$#" -ne 0 ]; then
+ bundle="$1"
+ args+=("--bundle" "$bundle")
+ fi
+
+ runc spec "${args[@]}"
+
+ # Always add additional mappings if we have idmaps.
+ if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"idmap"* ]]; then
+ runc_rootless_idmap "$bundle"
+ fi
+
+ # Ensure config.json contains linux.resources
+ if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then
+ runc_rootless_cgroup "$bundle"
+ fi
+}
+
+# Shortcut to add additional uids and gids, based on the values set as part of
+# a rootless configuration. Rewrites $bundle/config.json in place via jq.
+function runc_rootless_idmap() {
+ bundle="${1:-.}"
+ cat "$bundle/config.json" \
+ | jq '.mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .)' \
+ | jq '.linux.uidMappings |= .+ [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}]' \
+ | jq '.linux.gidMappings |= .+ [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}]' \
+ | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+10))"', "containerID": 1, "size": 20}]' \
+ | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+100))"', "containerID": 1000, "size": '"$(($ROOTLESS_GIDMAP_LENGTH-1000))"'}]' \
+ >"$bundle/config.json.tmp"
+ mv "$bundle/config.json"{.tmp,}
+}
+
+# Shortcut to add empty resources as part of a rootless configuration.
+# Rewrites $bundle/config.json in place via jq.
+function runc_rootless_cgroup() {
+ bundle="${1:-.}"
+ cat "$bundle/config.json" \
+ | jq '.linux.resources |= .+ {"memory":{},"cpu":{},"blockio":{},"pids":{}}' \
+ >"$bundle/config.json.tmp"
+ mv "$bundle/config.json"{.tmp,}
+}
+
+# Helper function to set cgroupsPath to the value of $CGROUPS_PATH
+# (the path is duplicated literally in the sed expression below).
+function set_cgroups_path() {
+ bundle="${1:-.}"
+ sed -i 's/\("linux": {\)/\1\n "cgroupsPath": "\/runc-cgroups-integration-test\/test-cgroup",/' "$bundle/config.json"
+}
+
+# Helper function to set a resources limit
+function set_resources_limit() {
+ bundle="${1:-.}"
+ sed -i 's/\("linux": {\)/\1\n "resources": { "pids": { "limit": 100 } },/' "$bundle/config.json"
+}
+
+# Fails the current test, providing the error given.
+function fail() {
+ echo "$@" >&2
+ exit 1
+}
+
+# Allows a test to specify what things it requires. If the environment can't
+# support it, the test is skipped with a message.
+# Known requirements: criu, root, rootless, rootless_idmap, rootless_cgroup,
+# rootless_no_cgroup, cgroups_kmem, cgroups_rt. Anything else is a test bug.
+function requires() {
+ for var in "$@"; do
+ case $var in
+ criu)
+ if [ ! -e "$CRIU" ]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ root)
+ if [ "$ROOTLESS" -ne 0 ]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ rootless)
+ if [ "$ROOTLESS" -eq 0 ]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ rootless_idmap)
+ if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ rootless_cgroup)
+ if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ rootless_no_cgroup)
+ if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then
+ skip "test requires ${var}"
+ fi
+ ;;
+ cgroups_kmem)
+ if [ ! -e "$KMEM" ]; then
+ skip "Test requires ${var}"
+ fi
+ ;;
+ cgroups_rt)
+ if [ ! -e "$RT_PERIOD" ]; then
+ skip "Test requires ${var}"
+ fi
+ ;;
+ *)
+ fail "BUG: Invalid requires ${var}."
+ ;;
+ esac
+ done
+}
+
+# Retry a command $1 times until it succeeds. Wait $2 seconds between retries.
+# The remaining arguments are the command; returns falsy after all attempts
+# fail, printing the last captured output.
+function retry() {
+ local attempts=$1
+ shift
+ local delay=$1
+ shift
+ local i
+
+ for ((i = 0; i < attempts; i++)); do
+ run "$@"
+ if [[ "$status" -eq 0 ]]; then
+ return 0
+ fi
+ sleep $delay
+ done
+
+ echo "Command \"$@\" failed $attempts times. Output: $output"
+ false
+}
+
+# Retry "runc state" on container $3 up to $1 times, sleeping $2 seconds
+# between attempts, until runc reports a state for it. Returns falsy after
+# all attempts fail.
+function wait_for_container() {
+ local attempts=$1
+ local delay=$2
+ local cid=$3
+ local i
+
+ for ((i = 0; i < attempts; i++)); do
+ runc state $cid
+ if [[ "$status" -eq 0 ]]; then
+ return 0
+ fi
+ sleep $delay
+ done
+
+ # fixed: message used to interpolate the undefined $statecheck variable
+ echo "runc state failed to return state for container $cid $attempts times. Output: $output"
+ false
+}
+
+# Same as wait_for_container, but queries the state under the runc root
+# directory given as $4 (via the ROOT override honoured by runc()).
+function wait_for_container_inroot() {
+ local attempts=$1
+ local delay=$2
+ local cid=$3
+ local i
+
+ for ((i = 0; i < attempts; i++)); do
+ ROOT=$4 runc state $cid
+ if [[ "$status" -eq 0 ]]; then
+ return 0
+ fi
+ sleep $delay
+ done
+
+ # fixed: message used to interpolate the undefined $statecheck variable
+ echo "runc state failed to return state for container $cid $attempts times. Output: $output"
+ false
+}
+
+function testcontainer() {
+ # test state of container: $1 is the container id, $2 the expected
+ # state substring (e.g. "running", "created", "paused")
+ runc state $1
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"$2"* ]]
+}
+
+function setup_recvtty() {
+ # We need to start recvtty in the background, so we double fork in the shell.
+ ("$RECVTTY" --pid-file "$BATS_TMPDIR/recvtty.pid" --mode null "$CONSOLE_SOCKET" &) &
+}
+
+function teardown_recvtty() {
+ # When we kill recvtty, the container will also be killed.
+ if [ -f "$BATS_TMPDIR/recvtty.pid" ]; then
+ kill -9 $(cat "$BATS_TMPDIR/recvtty.pid")
+ fi
+
+ # Clean up the files that might be left over.
+ rm -f "$BATS_TMPDIR/recvtty.pid"
+ rm -f "$CONSOLE_SOCKET"
+}
+
+function setup_busybox() {
+ setup_recvtty
+ run mkdir "$BUSYBOX_BUNDLE"
+ run mkdir "$BUSYBOX_BUNDLE"/rootfs
+ if [ -e "/testdata/busybox.tar" ]; then
+ BUSYBOX_IMAGE="/testdata/busybox.tar"
+ fi
+ if [ ! -e $BUSYBOX_IMAGE ]; then
+ curl -o $BUSYBOX_IMAGE -sSL `get_busybox`
+ fi
+ tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE"
+ cd "$BUSYBOX_BUNDLE"
+ runc_spec
+}
+
+function setup_hello() {
+ setup_recvtty
+ run mkdir "$HELLO_BUNDLE"
+ run mkdir "$HELLO_BUNDLE"/rootfs
+ tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE"
+ cd "$HELLO_BUNDLE"
+ runc_spec
+ sed -i 's;"sh";"/hello";' config.json
+}
+
+# Kill, wait for, and delete container $1 if "runc list" shows it.
+function teardown_running_container() {
+ runc list
+ # $1 should be a container name such as "test_busybox"
+ # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix
+ # e.g. "test_busybox" and "test_busybox_update"
+ if [[ "${output}" == *"$1 "* ]]; then
+ runc kill $1 KILL
+ retry 10 1 eval "__runc state '$1' | grep -q 'stopped'"
+ runc delete $1
+ fi
+}
+
+# Same as teardown_running_container, but against the runc root dir $2.
+function teardown_running_container_inroot() {
+ ROOT=$2 runc list
+ # $1 should be a container name such as "test_busybox"
+ # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix
+ # e.g. "test_busybox" and "test_busybox_update"
+ if [[ "${output}" == *"$1 "* ]]; then
+ ROOT=$2 runc kill $1 KILL
+ retry 10 1 eval "ROOT='$2' __runc state '$1' | grep -q 'stopped'"
+ ROOT=$2 runc delete $1
+ fi
+}
+
+# Remove the busybox container, its recvtty, and the bundle directory.
+function teardown_busybox() {
+ cd "$INTEGRATION_ROOT"
+ teardown_recvtty
+ teardown_running_container test_busybox
+ run rm -f -r "$BUSYBOX_BUNDLE"
+}
+
+# Remove the hello-world container, its recvtty, and the bundle directory.
+function teardown_hello() {
+ cd "$INTEGRATION_ROOT"
+ teardown_recvtty
+ teardown_running_container test_hello
+ run rm -f -r "$HELLO_BUNDLE"
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration test for "runc kill": a detached container is signalled with
+# KILL, reaches the stopped state, and can then be deleted.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+
+@test "kill detached busybox" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+
+ retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+ runc delete test_busybox
+ [ "$status" -eq 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc list": table (default and explicit), quiet,
+# and JSON output formats, with three containers under an alternate root.
+
+load helpers
+
+function setup() {
+ teardown_running_container_inroot test_box1 $HELLO_BUNDLE
+ teardown_running_container_inroot test_box2 $HELLO_BUNDLE
+ teardown_running_container_inroot test_box3 $HELLO_BUNDLE
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_running_container_inroot test_box1 $HELLO_BUNDLE
+ teardown_running_container_inroot test_box2 $HELLO_BUNDLE
+ teardown_running_container_inroot test_box3 $HELLO_BUNDLE
+ teardown_busybox
+}
+
+@test "list" {
+ # run a few busyboxes detached
+ ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box1
+ [ "$status" -eq 0 ]
+
+ ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box2
+ [ "$status" -eq 0 ]
+
+ ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box3
+ [ "$status" -eq 0 ]
+
+ ROOT=$HELLO_BUNDLE runc list
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]]
+ [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+ [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+ [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+
+ ROOT=$HELLO_BUNDLE runc list -q
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" == "test_box1" ]]
+ [[ "${lines[1]}" == "test_box2" ]]
+ [[ "${lines[2]}" == "test_box3" ]]
+
+ ROOT=$HELLO_BUNDLE runc list --format table
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]]
+ [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+ [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+ [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]]
+
+ # JSON output is a single line; the glob patterns below match the expected
+ # key/value sequence for each of the three containers in order
+ ROOT=$HELLO_BUNDLE runc list --format json
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" == [\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]]
+ [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]]
+ [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for maskedPaths: masked files and directories must read
+# as empty and must not be removable, writable, or unmountable.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+
+ # Create fake rootfs.
+ mkdir rootfs/testdir
+ echo "Forbidden information!" > rootfs/testfile
+
+ # add extra masked paths
+ sed -i 's;"maskedPaths": \[;"maskedPaths": \["/testdir","/testfile",;g' config.json
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "mask paths [file]" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # a masked file reads as empty
+ runc exec test_busybox cat /testfile
+ [ "$status" -eq 0 ]
+ [[ "${output}" == "" ]]
+
+ runc exec test_busybox rm -f /testfile
+ [ "$status" -eq 1 ]
+ [[ "${output}" == *"Read-only file system"* ]]
+
+ runc exec test_busybox umount /testfile
+ [ "$status" -eq 1 ]
+ [[ "${output}" == *"Operation not permitted"* ]]
+}
+
+@test "mask paths [directory]" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # a masked directory lists as empty
+ runc exec test_busybox ls /testdir
+ [ "$status" -eq 0 ]
+ [[ "${output}" == "" ]]
+
+ runc exec test_busybox touch /testdir/foo
+ [ "$status" -eq 1 ]
+ [[ "${output}" == *"Read-only file system"* ]]
+
+ runc exec test_busybox rm -rf /testdir
+ [ "$status" -eq 1 ]
+ [[ "${output}" == *"Read-only file system"* ]]
+
+ runc exec test_busybox umount /testdir
+ [ "$status" -eq 1 ]
+ [[ "${output}" == *"Operation not permitted"* ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration test for bind mounts: the current directory is bind-mounted
+# into the container and its config.json must be visible there.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc run [bind mount]" {
+ # add a bind mount of the bundle dir and make the container list the
+ # mounted config.json
+ CONFIG=$(jq '.mounts |= . + [{"source": ".", "destination": "/tmp/bind", "options": ["bind"]}] | .process.args = ["ls", "/tmp/bind/config.json"]' config.json)
+ echo "${CONFIG}" >config.json
+
+ runc run test_bind_mount
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" =~ '/tmp/bind/config.json' ]]
+}
--- /dev/null
+#!/bin/bash
+get_busybox(){
+ case $(go env GOARCH) in
+ arm64)
+ echo 'https://github.com/docker-library/busybox/raw/23fbd9c43e0f4bec7605091bfba23db278c367ac/glibc/busybox.tar.xz'
+ ;;
+ *)
+ echo 'https://github.com/docker-library/busybox/raw/a0558a9006ce0dd6f6ec5d56cfd3f32ebeeb815f/glibc/busybox.tar.xz'
+ ;;
+ esac
+}
+
+get_hello(){
+ case $(go env GOARCH) in
+ arm64)
+ echo 'hello-world-aarch64.tar'
+ ;;
+ *)
+ echo 'hello-world.tar'
+ ;;
+ esac
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc pause" / "runc resume", including the error
+# path for nonexistent containers.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc pause and resume" {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # pause busybox
+ runc pause test_busybox
+ [ "$status" -eq 0 ]
+
+ # test state of busybox is paused
+ testcontainer test_busybox paused
+
+ # resume busybox
+ runc resume test_busybox
+ [ "$status" -eq 0 ]
+
+ # test state of busybox is back to running
+ testcontainer test_busybox running
+}
+
+@test "runc pause and resume with nonexist container" {
+ # XXX: currently cgroups require root containers.
+ requires root
+
+ # run test_busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # pause test_busybox and nonexistent container
+ runc pause test_busybox
+ [ "$status" -eq 0 ]
+ runc pause nonexistent
+ [ "$status" -ne 0 ]
+
+ # test state of test_busybox is paused
+ testcontainer test_busybox paused
+
+ # resume test_busybox and nonexistent container
+ runc resume test_busybox
+ [ "$status" -eq 0 ]
+ runc resume nonexistent
+ [ "$status" -ne 0 ]
+
+ # test state of test_busybox is back to running
+ testcontainer test_busybox running
+
+ # delete test_busybox; assert success (was previously unchecked)
+ runc delete --force test_busybox
+ [ "$status" -eq 0 ]
+
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc ps": default table output, -f json, and
+# pass-through of extra ps(1) flags.
+
+load helpers
+
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "ps" {
+ # ps is not supported, it requires cgroups
+ requires root
+
+ # start busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ runc ps test_busybox
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]]
+ [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]]
+}
+
+@test "ps -f json" {
+ # ps is not supported, it requires cgroups
+ requires root
+
+ # start busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ runc ps -f json test_busybox
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ [0-9]+ ]]
+}
+
+@test "ps -e -x" {
+ # ps is not supported, it requires cgroups
+ requires root
+
+ # start busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ # extra flags after the container id are forwarded to ps(1)
+ runc ps test_busybox -e -x
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ \ +PID\ +TTY\ +STAT\ +TIME\ +COMMAND+ ]]
+ [[ "${lines[1]}" =~ [0-9]+ ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for the global --root flag: containers created under
+# different state roots must be isolated from each other.
+
+load helpers
+
+function setup() {
+ teardown_running_container_inroot test_dotbox $HELLO_BUNDLE
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_running_container_inroot test_dotbox $HELLO_BUNDLE
+ teardown_busybox
+}
+
+@test "global --root" {
+ # run busybox detached using $HELLO_BUNDLE for state
+ ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_dotbox
+ [ "$status" -eq 0 ]
+
+ # run busybox detached in default root
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ runc state test_busybox
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"running"* ]]
+
+ ROOT=$HELLO_BUNDLE runc state test_dotbox
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"running"* ]]
+
+ # each container is only visible under its own state root
+ ROOT=$HELLO_BUNDLE runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ runc state test_dotbox
+ [ "$status" -ne 0 ]
+
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+ retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+ runc delete test_busybox
+ [ "$status" -eq 0 ]
+
+ ROOT=$HELLO_BUNDLE runc kill test_dotbox KILL
+ [ "$status" -eq 0 ]
+ retry 10 1 eval "ROOT='$HELLO_BUNDLE' __runc state test_dotbox | grep -q 'stopped'"
+ ROOT=$HELLO_BUNDLE runc delete test_dotbox
+ [ "$status" -eq 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+# Integration tests for "runc spec": generating a config.json in the CWD,
+# via --bundle, and validating the generated spec against the runtime-spec
+# JSON schema.
+
+load helpers
+
+function setup() {
+ # initial cleanup in case a prior test exited and did not cleanup
+ cd "$INTEGRATION_ROOT"
+ run rm -f -r "$HELLO_BUNDLE"
+
+ # setup hello-world for spec generation testing
+ run mkdir "$HELLO_BUNDLE"
+ run mkdir "$HELLO_BUNDLE"/rootfs
+ run tar -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE"
+}
+
+function teardown() {
+ cd "$INTEGRATION_ROOT"
+ run rm -f -r "$HELLO_BUNDLE"
+}
+
+@test "spec generation cwd" {
+ cd "$HELLO_BUNDLE"
+ # note this test runs from the bundle not the integration root
+
+ # test that config.json does not exist after the above partial setup
+ [ ! -e config.json ]
+
+ # test generation of spec does not return an error
+ runc_spec
+ [ "$status" -eq 0 ]
+
+ # test generation of spec created our config.json (spec)
+ [ -e config.json ]
+
+ # test existence of required args parameter in the generated config.json
+ run bash -c "grep -A2 'args' config.json | grep 'sh'"
+ [[ "${output}" == *"sh"* ]]
+
+ # change the default args parameter from sh to hello
+ sed -i 's;"sh";"/hello";' config.json
+
+ # ensure the generated spec works by running hello-world
+ runc run test_hello
+ [ "$status" -eq 0 ]
+}
+
+@test "spec generation --bundle" {
+ # note this test runs from the integration root not the bundle
+
+ # test that config.json does not exist after the above partial setup
+ [ ! -e "$HELLO_BUNDLE"/config.json ]
+
+ # test generation of spec does not return an error
+ runc_spec "$HELLO_BUNDLE"
+ [ "$status" -eq 0 ]
+
+ # test generation of spec created our config.json (spec)
+ [ -e "$HELLO_BUNDLE"/config.json ]
+
+ # change the default args parameter from sh to hello
+ sed -i 's;"sh";"/hello";' "$HELLO_BUNDLE"/config.json
+
+ # ensure the generated spec works by running hello-world
+ runc run --bundle "$HELLO_BUNDLE" test_hello
+ [ "$status" -eq 0 ]
+}
+
+@test "spec validator" {
+ # NOTE(review): this test needs network access (git clone, go get) and a
+ # working Go toolchain; it builds the runtime-spec validator at the commit
+ # pinned in vendor.conf and runs it against a freshly generated spec.
+ TESTDIR=$(pwd)
+ cd "$HELLO_BUNDLE"
+
+ run git clone https://github.com/opencontainers/runtime-spec.git src/runtime-spec
+ [ "$status" -eq 0 ]
+
+ SPEC_COMMIT=$(grep '^github.com/opencontainers/runtime-spec' ${TESTDIR}/../../vendor.conf | cut -d ' ' -f 2)
+ run git -C src/runtime-spec reset --hard "${SPEC_COMMIT}"
+
+ [ "$status" -eq 0 ]
+ [ -e src/runtime-spec/schema/config-schema.json ]
+
+ run bash -c "GOPATH='$GOPATH' go get github.com/xeipuuv/gojsonschema"
+ [ "$status" -eq 0 ]
+
+ run git -C "${GOPATH}/src/github.com/xeipuuv/gojsonschema" reset --hard 6637feb73ee44cd4640bb3def285c29774234c7f
+ [ "$status" -eq 0 ]
+
+ GOPATH="$GOPATH" go build src/runtime-spec/schema/validate.go
+ [ -e ./validate ]
+
+ runc spec
+ [ -e config.json ]
+
+ run ./validate src/runtime-spec/schema/config-schema.json config.json
+ [ "$status" -eq 0 ]
+ [[ "${lines[0]}" == *"The document is valid"* ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Each test starts from a freshly extracted busybox bundle; tear down first
+# in case a previous run left containers or bundle state behind.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc start" {
+ runc create --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox created
+
+ # start container test_busybox
+ runc start test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # delete test_busybox
+ runc delete --force test_busybox
+
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Each test starts from a freshly extracted busybox bundle; tear down first
+# in case a previous run left containers or bundle state behind.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc run detached" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+}
+
+@test "runc run detached ({u,g}id != 0)" {
+ # cannot start containers as another user in rootless setup without idmap
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # replace "uid": 0 with "uid": 1000
+ # and do a similar thing for gid.
+ sed -i 's;"uid": 0;"uid": 1000;g' config.json
+ sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+}
+
+@test "runc run detached --pid-file" {
+ # run busybox detached
+ runc run --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+}
+
+# --pid-file must be resolved relative to the caller's CWD even when the
+# bundle lives elsewhere (-b), so run from a freshly created subdirectory.
+@test "runc run detached --pid-file with new CWD" {
+ # create pid_file directory as the CWD
+ run mkdir pid_file
+ [ "$status" -eq 0 ]
+ # NOTE: bats' "run" executes its command in a subshell, so "run cd" would
+ # not change this test's working directory and the test would silently run
+ # from the old CWD; cd directly instead.
+ cd pid_file
+
+ # run busybox detached
+ runc run --pid-file pid.txt -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Each test starts from a fresh hello-world bundle; tear down first in case
+# a previous run left containers or bundle state behind.
+function setup() {
+ teardown_hello
+ setup_hello
+}
+
+function teardown() {
+ teardown_hello
+}
+
+@test "runc run" {
+ # run hello-world
+ runc run test_hello
+ [ "$status" -eq 0 ]
+
+ # check expected output
+ [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run ({u,g}id != 0)" {
+ # cannot start containers as another user in rootless setup without idmap
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # replace "uid": 0 with "uid": 1000
+ # and do a similar thing for gid.
+ sed -i 's;"uid": 0;"uid": 1000;g' config.json
+ sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+ # run hello-world
+ runc run test_hello
+ [ "$status" -eq 0 ]
+
+ # check expected output
+ [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run with rootfs set to ." {
+ cp config.json rootfs/.
+ rm config.json
+ cd rootfs
+ sed -i 's;"rootfs";".";' config.json
+
+ # run hello-world
+ runc run test_hello
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"Hello"* ]]
+}
+
+@test "runc run --pid-file" {
+ # run hello-world
+ runc run --pid-file pid.txt test_hello
+ [ "$status" -eq 0 ]
+ [[ "${output}" == *"Hello"* ]]
+
+ # check pid.txt was generated
+ [ -e pid.txt ]
+
+ run cat pid.txt
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ [0-9]+ ]]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Each test starts from a freshly extracted busybox bundle; tear down first
+# in case a previous run left containers or bundle state behind.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "state (kill + delete)" {
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+
+ # wait for busybox to be in the destroyed state
+ retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'"
+
+ # delete test_busybox
+ runc delete test_busybox
+ [ "$status" -eq 0 ]
+
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+}
+
+@test "state (pause + resume)" {
+ # XXX: pause and resume require cgroups.
+ requires root
+
+ runc state test_busybox
+ [ "$status" -ne 0 ]
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # check state
+ testcontainer test_busybox running
+
+ # pause busybox
+ runc pause test_busybox
+ [ "$status" -eq 0 ]
+
+ # test state of busybox is paused
+ testcontainer test_busybox paused
+
+ # resume busybox
+ runc resume test_busybox
+ [ "$status" -eq 0 ]
+
+ # test state of busybox is back to running
+ testcontainer test_busybox running
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Each test starts from a freshly extracted busybox bundle; tear down first
+# in case a previous run left containers or bundle state behind.
+function setup() {
+ teardown_busybox
+ setup_busybox
+}
+
+function teardown() {
+ teardown_busybox
+}
+
+@test "runc run [tty ptsname]" {
+ # Replace sh script with readlink.
+ sed -i 's|"sh"|"sh", "-c", "for file in /proc/self/fd/[012]; do readlink $file; done"|' config.json
+
+ # run busybox
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ /dev/pts/+ ]]
+ [[ ${lines[1]} =~ /dev/pts/+ ]]
+ [[ ${lines[2]} =~ /dev/pts/+ ]]
+}
+
+@test "runc run [tty owner]" {
+ # tty chmod is not doable in rootless containers without idmap.
+ # TODO: this can be made as a change to the gid test.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # Replace sh script with stat.
+ sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json
+
+ # run busybox
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ 0 ]]
+ # This is set by the default config.json (it corresponds to the standard tty group).
+ [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc run [tty owner] ({u,g}id != 0)" {
+ # tty chmod is not doable in rootless containers without idmap.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # replace "uid": 0 with "uid": 1000
+ # and do a similar thing for gid.
+ sed -i 's;"uid": 0;"uid": 1000;g' config.json
+ sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+ # Replace sh script with stat.
+ sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json
+
+ # run busybox
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ 1000 ]]
+ # This is set by the default config.json (it corresponds to the standard tty group).
+ [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc exec [tty ptsname]" {
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # make sure we're running
+ testcontainer test_busybox running
+
+ # run the exec
+ runc exec test_busybox sh -c 'for file in /proc/self/fd/[012]; do readlink $file; done'
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ /dev/pts/+ ]]
+ [[ ${lines[1]} =~ /dev/pts/+ ]]
+ [[ ${lines[2]} =~ /dev/pts/+ ]]
+}
+
+@test "runc exec [tty owner]" {
+ # tty chmod is not doable in rootless containers without idmap.
+ # TODO: this can be made as a change to the gid test.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # make sure we're running
+ testcontainer test_busybox running
+
+ # run the exec
+ runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n'
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ 0 ]]
+ [[ ${lines[1]} =~ 5 ]]
+}
+
+@test "runc exec [tty owner] ({u,g}id != 0)" {
+ # tty chmod is not doable in rootless containers without idmap.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap
+
+ # replace "uid": 0 with "uid": 1000
+ # and do a similar thing for gid.
+ sed -i 's;"uid": 0;"uid": 1000;g' config.json
+ sed -i 's;"gid": 0;"gid": 100;g' config.json
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # make sure we're running
+ testcontainer test_busybox running
+
+ # run the exec
+ runc exec test_busybox sh -c 'stat -c %u:%g $(tty) | tr : \\n'
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ 1000 ]]
+ [[ ${lines[1]} =~ 5 ]]
+}
+
+# The consoleSize requested in an exec's process spec must be applied to the
+# allocated terminal; a second exec reads back the size stty recorded.
+@test "runc exec [tty consolesize]" {
+ # allow writing to filesystem
+ sed -i 's/"readonly": true/"readonly": false/' config.json
+
+ # run busybox detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_busybox
+ [ "$status" -eq 0 ]
+
+ # make sure we're running
+ testcontainer test_busybox running
+
+ # process spec requesting a 10x110 terminal; stty records the actual size
+ tty_info_with_console_size=$( cat <<EOF
+{
+ "terminal": true,
+ "consoleSize": {
+ "height": 10,
+ "width": 110
+ },
+ "args": [
+ "/bin/sh",
+ "-c",
+ "/bin/stty -a > /tmp/tty-info"
+ ],
+ "cwd": "/"
+}
+EOF
+ )
+
+ # run the exec
+ runc exec --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET -p <( echo $tty_info_with_console_size ) test_busybox
+ [ "$status" -eq 0 ]
+
+ # check the pid was generated
+ [ -e pid.txt ]
+
+ # wait for the user process to finish
+ timeout 1 tail --pid=$(head -n 1 pid.txt) -f /dev/null
+
+ tty_info=$( cat <<EOF
+{
+ "args": [
+ "/bin/cat",
+ "/tmp/tty-info"
+ ],
+ "cwd": "/"
+}
+EOF
+ )
+
+ # run the exec
+ runc exec -p <( echo $tty_info ) test_busybox
+ [ "$status" -eq 0 ]
+
+ # test tty width and height against original process.json
+ [[ ${lines[0]} =~ "rows 10; columns 110" ]]
+}
+
+@test "runc create [terminal=false]" {
+ # Disable terminal creation.
+ sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+ # Replace sh script with sleep.
+ sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+ # Make sure that the handling of detached IO is done properly. See #1354.
+ __runc create test_busybox
+
+ # Start the command.
+ runc start test_busybox
+ [ "$status" -eq 0 ]
+
+ testcontainer test_busybox running
+
+ # Kill the container.
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+}
+
+# With terminal=false, a foreground run must not hang on console IO; run it
+# in the background and poll until the container reports running.
+@test "runc run [terminal=false]" {
+ # Disable terminal creation.
+ sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+ # Replace sh script with sleep.
+ sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+ # Make sure that the handling of non-detached IO is done properly. See #1354.
+ (
+ __runc run test_busybox
+ ) &
+
+ wait_for_container 15 1 test_busybox
+ testcontainer test_busybox running
+
+ # Kill the container.
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+}
+
+@test "runc run -d [terminal=false]" {
+ # Disable terminal creation.
+ sed -i 's|"terminal": true,|"terminal": false,|g' config.json
+ # Replace sh script with sleep.
+ sed -i 's|"sh"|"sleep", "1000s"|' config.json
+
+ # Make sure that the handling of detached IO is done properly. See #1354.
+ __runc run -d test_busybox
+
+ testcontainer test_busybox running
+
+ # Kill the container.
+ runc kill test_busybox KILL
+ [ "$status" -eq 0 ]
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+# Remove the temporary resources file and every container this suite may
+# have started, then drop the busybox bundle.
+function teardown() {
+ rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json
+ teardown_running_container test_update
+ teardown_running_container test_update_rt
+ teardown_busybox
+}
+
+# Fresh busybox bundle with a dedicated cgroups path and a known baseline of
+# resource limits spliced into config.json so updates can be verified.
+function setup() {
+ teardown
+ setup_busybox
+
+ set_cgroups_path "$BUSYBOX_BUNDLE"
+
+ # Set some initial known values
+ DATA=$(cat <<EOF
+ "memory": {
+ "limit": 33554432,
+ "reservation": 25165824,
+ "kernel": 16777216,
+ "kernelTCP": 11534336
+ },
+ "cpu": {
+ "shares": 100,
+ "quota": 500000,
+ "period": 1000000,
+ "cpus": "0"
+ },
+ "blockio": {
+ "weight": 1000
+ },
+ "pids": {
+ "limit": 20
+ },
+EOF
+ )
+ # flatten the heredoc and insert it right after the "resources": { line
+ DATA=$(echo ${DATA} | sed 's/\n/\\n/g')
+ sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json
+}
+
+function check_cgroup_value() {
+ cgroup=$1
+ source=$2
+ expected=$3
+
+ current=$(cat $cgroup/$source)
+ [ "$current" == "$expected" ]
+}
+
+# TODO: test rt cgroup updating
+# End-to-end check of "runc update": set known baseline limits, change each
+# one via flags, then revert via JSON on stdin, flags in bulk, and a JSON
+# file, verifying the cgroup filesystem after every step.
+@test "update" {
+ # XXX: this test should be split into separate sections so that we
+ # can skip kmem without skipping update tests overall.
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+ requires cgroups_kmem
+
+ # run a few busyboxes detached
+ runc run -d --console-socket $CONSOLE_SOCKET test_update
+ [ "$status" -eq 0 ]
+
+ # get the cgroup paths
+ # (locate each controller's mountpoint in mountinfo and append the
+ # container's cgroup path)
+ for g in MEMORY CPUSET CPU BLKIO PIDS; do
+ base_path=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'${g}'\>/ { print $5; exit }')
+ eval CGROUP_${g}="${base_path}${CGROUPS_PATH}"
+ done
+
+ CGROUP_SYSTEM_MEMORY=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'MEMORY'\>/ { print $5; exit }')
+
+ # check that initial values were properly set
+ check_cgroup_value $CGROUP_BLKIO "blkio.weight" 1000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+ check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+ check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+ check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+ check_cgroup_value $CGROUP_PIDS "pids.max" 20
+
+ # update blkio-weight
+ runc update test_update --blkio-weight 500
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_BLKIO "blkio.weight" 500
+
+ # update cpu-period
+ runc update test_update --cpu-period 900000
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000
+
+ # update cpu-quota
+ runc update test_update --cpu-quota 600000
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000
+
+ # update cpu-shares
+ runc update test_update --cpu-share 200
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_CPU "cpu.shares" 200
+
+ # update cpuset if supported (i.e. we're running on a multicore cpu)
+ cpu_count=$(grep '^processor' /proc/cpuinfo | wc -l)
+ if [ $cpu_count -gt 1 ]; then
+ runc update test_update --cpuset-cpus "1"
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 1
+ fi
+
+ # update memory limit
+ runc update test_update --memory 67108864
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864
+
+ # human-readable size suffixes must also be accepted
+ runc update test_update --memory 50M
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 52428800
+
+ # update memory soft limit
+ runc update test_update --memory-reservation 33554432
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432
+
+ # Run swap memory tests if swap is available
+ if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then
+ # try to remove memory swap limit
+ runc update test_update --memory-swap -1
+ [ "$status" -eq 0 ]
+ # Get System memory swap limit
+ SYSTEM_MEMORY_SW=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.memsw.limit_in_bytes")
+ check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY_SW}
+
+ # update memory swap
+ runc update test_update --memory-swap 96468992
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" 96468992
+ fi;
+
+ # try to remove memory limit
+ runc update test_update --memory -1
+ [ "$status" -eq 0 ]
+
+ # Get System memory limit
+ SYSTEM_MEMORY=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.limit_in_bytes")
+ # check memory limited is gone
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" ${SYSTEM_MEMORY}
+
+ # check swap memory limited is gone
+ if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then
+ check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY}
+ fi
+
+ # update kernel memory limit
+ runc update test_update --kernel-memory 50331648
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+
+ # update kernel memory tcp limit
+ runc update test_update --kernel-memory-tcp 41943040
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040
+
+ # update pids limit
+ runc update test_update --pids-limit 10
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_PIDS "pids.max" 10
+
+ # Revert to the test initial value via json on stdin
+ runc update -r - test_update <<EOF
+{
+ "memory": {
+ "limit": 33554432,
+ "reservation": 25165824,
+ "kernel": 16777216,
+ "kernelTCP": 11534336
+ },
+ "cpu": {
+ "shares": 100,
+ "quota": 500000,
+ "period": 1000000,
+ "cpus": "0"
+ },
+ "blockIO": {
+ "weight": 1000
+ },
+ "pids": {
+ "limit": 20
+ }
+}
+EOF
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_BLKIO "blkio.weight" 1000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+ check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+ check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+ check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+ check_cgroup_value $CGROUP_PIDS "pids.max" 20
+
+ # redo all the changes at once
+ runc update test_update --blkio-weight 500 \
+ --cpu-period 900000 --cpu-quota 600000 --cpu-share 200 --memory 67108864 \
+ --memory-reservation 33554432 --kernel-memory 50331648 --kernel-memory-tcp 41943040 \
+ --pids-limit 10
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_BLKIO "blkio.weight" 500
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000
+ check_cgroup_value $CGROUP_CPU "cpu.shares" 200
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864
+ check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432
+ check_cgroup_value $CGROUP_PIDS "pids.max" 10
+
+ # reset to initial test value via json file
+ DATA=$(cat <<"EOF"
+{
+ "memory": {
+ "limit": 33554432,
+ "reservation": 25165824,
+ "kernel": 16777216,
+ "kernelTCP": 11534336
+ },
+ "cpu": {
+ "shares": 100,
+ "quota": 500000,
+ "period": 1000000,
+ "cpus": "0"
+ },
+ "blockIO": {
+ "weight": 1000
+ },
+ "pids": {
+ "limit": 20
+ }
+}
+EOF
+)
+ echo $DATA > $BATS_TMPDIR/runc-cgroups-integration-test.json
+
+ runc update -r $BATS_TMPDIR/runc-cgroups-integration-test.json test_update
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_BLKIO "blkio.weight" 1000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000
+ check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000
+ check_cgroup_value $CGROUP_CPU "cpu.shares" 100
+ check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216
+ check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336
+ check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432
+ check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824
+ check_cgroup_value $CGROUP_PIDS "pids.max" 20
+}
+
+# Update the realtime cgroup period/runtime via JSON on stdin and via flags,
+# verifying the cgroup files after each update. Status is checked after each
+# "runc update" so a failed update is reported directly instead of as a
+# confusing cgroup-value mismatch.
+@test "update rt period and runtime" {
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+ requires cgroups_kmem cgroups_rt
+
+ # run a detached busybox
+ runc run -d --console-socket $CONSOLE_SOCKET test_update_rt
+ [ "$status" -eq 0 ]
+
+ # get the cgroup paths
+ eval CGROUP_CPU="${CGROUP_CPU_BASE_PATH}${CGROUPS_PATH}"
+
+ # update via JSON on stdin
+ runc update -r - test_update_rt <<EOF
+{
+ "cpu": {
+ "realtimePeriod": 800001,
+ "realtimeRuntime": 500001
+ }
+}
+EOF
+ [ "$status" -eq 0 ]
+ check_cgroup_value $CGROUP_CPU "cpu.rt_period_us" 800001
+ check_cgroup_value $CGROUP_CPU "cpu.rt_runtime_us" 500001
+
+ # update via command-line flags
+ runc update test_update_rt --cpu-rt-period 900001 --cpu-rt-runtime 600001
+ [ "$status" -eq 0 ]
+
+ check_cgroup_value $CGROUP_CPU "cpu.rt_period_us" 900001
+ check_cgroup_value $CGROUP_CPU "cpu.rt_runtime_us" 600001
+}
--- /dev/null
+#!/usr/bin/env bats
+
+load helpers
+
+@test "runc version" {
+ runc -v
+ [ "$status" -eq 0 ]
+ [[ ${lines[0]} =~ runc\ version\ [0-9]+\.[0-9]+\.[0-9]+ ]]
+ [[ ${lines[1]} =~ commit:+ ]]
+ [[ ${lines[2]} =~ spec:\ [0-9]+\.[0-9]+\.[0-9]+ ]]
+}
--- /dev/null
+#!/bin/bash
+# Copyright (C) 2017 SUSE LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# rootless.sh -- Runner for rootless container tests. The purpose of this
+# script is to allow for the addition (and testing) of "opportunistic" features
+# to rootless containers while still testing the base features. In order to add
+# a new feature, please match the existing style. Add an entry to $ALL_FEATURES,
+# and add an enable_* and disable_* hook.
+
+# Feature knobs iterated by the powerset loop below; each entry must have a
+# matching enable_<name> and disable_<name> hook defined in this file.
+ALL_FEATURES=("idmap" "cgroup")
+# Directory one level above this script (used to locate tests/integration).
+ROOT="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")"
+
+# FEATURE: Opportunistic new{uid,gid}map support, allowing a rootless container
+# to be set up with the usage of helper setuid binaries.
+
+function enable_idmap() {
+ export ROOTLESS_UIDMAP_START=100000 ROOTLESS_UIDMAP_LENGTH=65536
+ export ROOTLESS_GIDMAP_START=200000 ROOTLESS_GIDMAP_LENGTH=65536
+
+ # Set up sub{uid,gid} mappings.
+ # disable_idmap parks the live files at *.tmp; restore them first so the
+ # grep below reads the current contents before rewriting.
+ [ -e /etc/subuid.tmp ] && mv /etc/subuid{.tmp,}
+ ( grep -v '^rootless' /etc/subuid ; echo "rootless:$ROOTLESS_UIDMAP_START:$ROOTLESS_UIDMAP_LENGTH" ) > /etc/subuid.tmp
+ mv /etc/subuid{.tmp,}
+ [ -e /etc/subgid.tmp ] && mv /etc/subgid{.tmp,}
+ ( grep -v '^rootless' /etc/subgid ; echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" ) > /etc/subgid.tmp
+ mv /etc/subgid{.tmp,}
+
+ # Reactivate new{uid,gid}map helpers if applicable.
+ [ -e /usr/bin/unused-newuidmap ] && mv /usr/bin/{unused-,}newuidmap
+ [ -e /usr/bin/unused-newgidmap ] && mv /usr/bin/{unused-,}newgidmap
+}
+
+function disable_idmap() {
+ # Clear the mapping parameters so the test helpers see idmap as
+ # unavailable. A bare "export VAR" (no assignment) would keep any values
+ # set by a previous enable_idmap alive, leaving idmap apparently enabled.
+ unset ROOTLESS_UIDMAP_START ROOTLESS_UIDMAP_LENGTH
+ unset ROOTLESS_GIDMAP_START ROOTLESS_GIDMAP_LENGTH
+
+ # Deactivate sub{uid,gid} mappings.
+ [ -e /etc/subuid ] && mv /etc/subuid{,.tmp}
+ [ -e /etc/subgid ] && mv /etc/subgid{,.tmp}
+
+ # Deactivate new{uid,gid}map helpers. setuid is preserved with mv(1).
+ [ -e /usr/bin/newuidmap ] && mv /usr/bin/{,unused-}newuidmap
+ [ -e /usr/bin/newgidmap ] && mv /usr/bin/{,unused-}newgidmap
+}
+
+# FEATURE: Opportunistic cgroups support, allowing a rootless container to set
+# resource limits on condition that cgroupsPath is set to a path the
+# rootless user has permissions on.
+
+# List of cgroups. We handle name= cgroups as well as combined
+# (comma-separated) cgroups and correctly split and/or strip them.
+ALL_CGROUPS=( $(cat /proc/self/cgroup | cut -d: -f2 | sed -E '{s/^name=//;s/,/\n/;/^$/D}') )
+# Where the cgroup hierarchies are mounted, and the per-run subtree that the
+# rootless user is granted write access to.
+CGROUP_MOUNT="/sys/fs/cgroup"
+CGROUP_PATH="/runc-cgroups-integration-test"
+
+# Grant the "rootless" user write access to a dedicated cgroup subtree in
+# every controller so rootless containers can apply resource limits.
+function enable_cgroup() {
+ # Set up cgroups for use in rootless containers.
+ for cg in "${ALL_CGROUPS[@]}"
+ do
+ mkdir -p "$CGROUP_MOUNT/$cg$CGROUP_PATH"
+ # We only need to allow write access to {cgroup.procs,tasks} and the
+ # directory. Rather than changing the owner entirely, we just change
+ # the group and then allow write access to the group (in order to
+ # further limit the possible DAC permissions that runc could use).
+ chown root:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks}
+ chmod g+rwx "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks}
+ # Due to cpuset's semantics we need to give extra permissions to allow
+ # for runc to set up the hierarchy. XXX: This really shouldn't be
+ # necessary, and might actually be a bug in our impl of cgroup
+ # handling.
+ [[ "$cg" == "cpuset" ]] && chown rootless:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/cpuset."{cpus,mems}
+ done
+}
+
+# Remove the per-run cgroup subtree from every controller; rmdir only works
+# because the containers using it have already been torn down.
+function disable_cgroup() {
+ # Remove cgroups used in rootless containers.
+ for cg in "${ALL_CGROUPS[@]}"
+ do
+ [ -d "$CGROUP_MOUNT/$cg$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$cg$CGROUP_PATH"
+ done
+}
+
+# Create a powerset of $ALL_FEATURES (the set of all subsets of $ALL_FEATURES).
+# We test all of the possible combinations (as long as we don't add too many
+# feature knobs this shouldn't take too long -- but the number of tested
+# combinations is O(2^n)).
+# Implementation: each argument becomes an optional "<arg>+" brace-expansion
+# element, and eval'ing the expansion yields a colon-separated list of every
+# feature combination (including the empty one).
+function powerset() {
+ eval printf '%s' $(printf '{,%s+}' "$@"):
+}
+features_powerset="$(powerset "${ALL_FEATURES[@]}")"
+
+# Iterate over the powerset of all features.
+# Initialise the counter explicitly so a stale "idx" inherited from the
+# caller's environment cannot skew the numbering or break the arithmetic.
+idx=0
+IFS=:
+for enabled_features in $features_powerset
+do
+ idx="$(($idx+1))"
+ echo "[$(printf '%.2d' "$idx")] run rootless tests ... (${enabled_features%%+})"
+
+ # toggle each feature's hooks according to this combination
+ unset IFS
+ for feature in "${ALL_FEATURES[@]}"
+ do
+ hook_func="disable_$feature"
+ grep -E "(^|\+)$feature(\+|$)" <<<$enabled_features &>/dev/null && hook_func="enable_$feature"
+ "$hook_func"
+ done
+
+ # Run the test suite!
+ set -e
+ echo path: $PATH
+ export ROOTLESS_FEATURES="$enabled_features"
+ sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$TESTFLAGS"
+ set +e
+done
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "os/signal"
+ "sync"
+
+ "github.com/containerd/console"
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// tty bundles the stdio/console plumbing for a container process: the
+// console handles, the goroutines copying data, and the closers that must
+// run during teardown.
+type tty struct {
+ epoller *console.Epoller
+ console *console.EpollConsole
+ stdin console.Console // caller's stdin, saved so raw mode can be Reset()
+ closers []io.Closer // closed in Close() after IO has drained
+ postStart []io.Closer // closed right after container start (ClosePostStart)
+ wg sync.WaitGroup // tracks the copyIO goroutines
+ consoleC chan error // console-setup result read by waitConsole; presumably set by the caller — confirm at call sites
+}
+
+// copyIO streams r into w until EOF, then closes the reader and signals the
+// tty's WaitGroup that this copier has finished.
+func (t *tty) copyIO(w io.Writer, r io.ReadCloser) {
+ defer t.wg.Done()
+ defer r.Close()
+ io.Copy(w, r)
+}
+
+// setup pipes for the process so that advanced features like c/r are able to easily checkpoint
+// and restore the process's IO without depending on a host specific path or device
+func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, error) {
+ pio, err := p.InitializeIO(rootuid, rootgid)
+ if err != nil {
+ return nil, err
+ }
+ t := new(tty)
+ t.closers = append(t.closers, pio.Stdin, pio.Stdout, pio.Stderr)
+ // add the process's io to the post start closers if they support close
+ for _, s := range []interface{}{p.Stdin, p.Stdout, p.Stderr} {
+ if closer, ok := s.(io.Closer); ok {
+ t.postStart = append(t.postStart, closer)
+ }
+ }
+ // feed the caller's stdin into the process and close the pipe when done
+ go func() {
+ io.Copy(pio.Stdin, os.Stdin)
+ pio.Stdin.Close()
+ }()
+ t.wg.Add(2)
+ go t.copyIO(os.Stdout, pio.Stdout)
+ go t.copyIO(os.Stderr, pio.Stderr)
+ return t, nil
+}
+
+// inheritStdio wires the container process directly to the caller's own
+// stdio streams instead of creating pipes or a console.
+func inheritStdio(proc *libcontainer.Process) error {
+ proc.Stdin, proc.Stdout, proc.Stderr = os.Stdin, os.Stdout, os.Stderr
+ return nil
+}
+
+// recvtty receives the container's console master fd over the provided
+// socket, registers it with an epoll-driven copier wired to the caller's
+// stdio, and switches the caller's stdin into raw mode. The process
+// argument is unused here but kept so the signature matches its callers.
+// The named return is retErr (lowercase, per Go naming conventions) so the
+// cleanup defer can observe failures that happen after the console exists.
+func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (retErr error) {
+ f, err := utils.RecvFd(socket)
+ if err != nil {
+ return err
+ }
+ cons, err := console.ConsoleFromFile(f)
+ if err != nil {
+ return err
+ }
+ console.ClearONLCR(cons.Fd())
+ epoller, err := console.NewEpoller()
+ if err != nil {
+ return err
+ }
+ epollConsole, err := epoller.Add(cons)
+ if err != nil {
+ return err
+ }
+ defer func() {
+ // don't leak the console if any of the remaining setup fails
+ if retErr != nil {
+ epollConsole.Close()
+ }
+ }()
+ go epoller.Wait()
+ go io.Copy(epollConsole, os.Stdin)
+ t.wg.Add(1)
+ go t.copyIO(os.Stdout, epollConsole)
+
+ // set raw mode to stdin and also handle interrupt
+ stdin, err := console.ConsoleFromFile(os.Stdin)
+ if err != nil {
+ return err
+ }
+ if err := stdin.SetRaw(); err != nil {
+ return fmt.Errorf("failed to set the terminal from the stdin: %v", err)
+ }
+ go handleInterrupt(stdin)
+
+ t.epoller = epoller
+ t.stdin = stdin
+ t.console = epollConsole
+ t.closers = []io.Closer{epollConsole}
+ return nil
+}
+
+// handleInterrupt blocks until SIGINT arrives, then restores the console's
+// saved state and exits the process.
+func handleInterrupt(c console.Console) {
+ interrupts := make(chan os.Signal, 1)
+ signal.Notify(interrupts, os.Interrupt)
+ <-interrupts
+ c.Reset()
+ os.Exit(0)
+}
+
+// waitConsole blocks until console setup has reported its result; it is a
+// no-op when no console setup was started (consoleC is nil).
+func (t *tty) waitConsole() error {
+ if t.consoleC == nil {
+ return nil
+ }
+ return <-t.consoleC
+}
+
+// ClosePostStart closes any fds that are provided to the container and dup2'd
+// so that we no longer have copy in our process.
+func (t *tty) ClosePostStart() error {
+ for i := range t.postStart {
+ t.postStart[i].Close()
+ }
+ return nil
+}
+
+// Close closes all open fds for the tty and/or restores the original
+// stdin state to what it was prior to the container execution
+func (t *tty) Close() error {
+ // ensure that our side of the fds are always closed
+ for _, c := range t.postStart {
+ c.Close()
+ }
+ // the process is gone at this point, shutting down the console if we have
+ // one and wait for all IO to be finished
+ if t.console != nil && t.epoller != nil {
+ t.console.Shutdown(t.epoller.CloseConsole)
+ }
+ // wait for the copyIO goroutines started earlier to drain and exit
+ t.wg.Wait()
+ for _, c := range t.closers {
+ c.Close()
+ }
+ // restore the terminal attributes saved when stdin was put into raw mode
+ if t.stdin != nil {
+ t.stdin.Reset()
+ }
+ return nil
+}
+
+// resize propagates the current host terminal dimensions to the container's
+// console; it is a no-op when the container has no console.
+func (t *tty) resize() error {
+ if t.console != nil {
+ return t.console.ResizeFrom(console.Current())
+ }
+ return nil
+}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "strconv"
+
+ "github.com/docker/go-units"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/urfave/cli"
+)
+
+// i64Ptr returns a pointer to a copy of i.
+func i64Ptr(i int64) *int64 {
+ return &i
+}
+
+// u64Ptr returns a pointer to a copy of i.
+func u64Ptr(i uint64) *uint64 {
+ return &i
+}
+
+// u16Ptr returns a pointer to a copy of i.
+func u16Ptr(i uint16) *uint16 {
+ return &i
+}
+
+var updateCommand = cli.Command{
+ Name: "update",
+ Usage: "update container resource constraints",
+ ArgsUsage: `<container-id>`,
+ Flags: []cli.Flag{
+ cli.StringFlag{
+ Name: "resources, r",
+ Value: "",
+ Usage: `path to the file containing the resources to update or '-' to read from the standard input
+
+The accepted format is as follow (unchanged values can be omitted):
+
+{
+ "memory": {
+ "limit": 0,
+ "reservation": 0,
+ "swap": 0,
+ "kernel": 0,
+ "kernelTCP": 0
+ },
+ "cpu": {
+ "shares": 0,
+ "quota": 0,
+ "period": 0,
+ "realtimeRuntime": 0,
+ "realtimePeriod": 0,
+ "cpus": "",
+ "mems": ""
+ },
+ "blockIO": {
+ "weight": 0
+ }
+}
+
+Note: if data is to be read from a file or the standard input, all
+other options are ignored.
+`,
+ },
+
+ cli.IntFlag{
+ Name: "blkio-weight",
+ Usage: "Specifies per cgroup weight, range is from 10 to 1000",
+ },
+ cli.StringFlag{
+ Name: "cpu-period",
+ Usage: "CPU CFS period to be used for hardcapping (in usecs). 0 to use system default",
+ },
+ cli.StringFlag{
+ Name: "cpu-quota",
+ Usage: "CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period",
+ },
+ cli.StringFlag{
+ Name: "cpu-share",
+ Usage: "CPU shares (relative weight vs. other containers)",
+ },
+ cli.StringFlag{
+ Name: "cpu-rt-period",
+ Usage: "CPU realtime period to be used for hardcapping (in usecs). 0 to use system default",
+ },
+ cli.StringFlag{
+ Name: "cpu-rt-runtime",
+ Usage: "CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period",
+ },
+ cli.StringFlag{
+ Name: "cpuset-cpus",
+ Usage: "CPU(s) to use",
+ },
+ cli.StringFlag{
+ Name: "cpuset-mems",
+ Usage: "Memory node(s) to use",
+ },
+ cli.StringFlag{
+ Name: "kernel-memory",
+ Usage: "Kernel memory limit (in bytes)",
+ },
+ cli.StringFlag{
+ Name: "kernel-memory-tcp",
+ Usage: "Kernel memory limit (in bytes) for tcp buffer",
+ },
+ cli.StringFlag{
+ Name: "memory",
+ Usage: "Memory limit (in bytes)",
+ },
+ cli.StringFlag{
+ Name: "memory-reservation",
+ Usage: "Memory reservation or soft_limit (in bytes)",
+ },
+ cli.StringFlag{
+ Name: "memory-swap",
+ Usage: "Total memory usage (memory + swap); set '-1' to enable unlimited swap",
+ },
+ cli.IntFlag{
+ Name: "pids-limit",
+ Usage: "Maximum number of pids allowed in the container",
+ },
+ cli.StringFlag{
+ Name: "l3-cache-schema",
+ Usage: "The string of Intel RDT/CAT L3 cache schema",
+ },
+ cli.StringFlag{
+ Name: "mem-bw-schema",
+ Usage: "The string of Intel RDT/MBA memory bandwidth schema",
+ },
+ },
+ Action: func(context *cli.Context) error {
+ if err := checkArgs(context, 1, exactArgs); err != nil {
+ return err
+ }
+ container, err := getContainer(context)
+ if err != nil {
+ return err
+ }
+
+ r := specs.LinuxResources{
+ Memory: &specs.LinuxMemory{
+ Limit: i64Ptr(0),
+ Reservation: i64Ptr(0),
+ Swap: i64Ptr(0),
+ Kernel: i64Ptr(0),
+ KernelTCP: i64Ptr(0),
+ },
+ CPU: &specs.LinuxCPU{
+ Shares: u64Ptr(0),
+ Quota: i64Ptr(0),
+ Period: u64Ptr(0),
+ RealtimeRuntime: i64Ptr(0),
+ RealtimePeriod: u64Ptr(0),
+ Cpus: "",
+ Mems: "",
+ },
+ BlockIO: &specs.LinuxBlockIO{
+ Weight: u16Ptr(0),
+ },
+ Pids: &specs.LinuxPids{
+ Limit: 0,
+ },
+ }
+
+ config := container.Config()
+
+ if in := context.String("resources"); in != "" {
+ var (
+ f *os.File
+ err error
+ )
+ switch in {
+ case "-":
+ f = os.Stdin
+ default:
+ f, err = os.Open(in)
+ if err != nil {
+ return err
+ }
+ }
+ err = json.NewDecoder(f).Decode(&r)
+ if err != nil {
+ return err
+ }
+ } else {
+ if val := context.Int("blkio-weight"); val != 0 {
+ r.BlockIO.Weight = u16Ptr(uint16(val))
+ }
+ if val := context.String("cpuset-cpus"); val != "" {
+ r.CPU.Cpus = val
+ }
+ if val := context.String("cpuset-mems"); val != "" {
+ r.CPU.Mems = val
+ }
+
+ for _, pair := range []struct {
+ opt string
+ dest *uint64
+ }{
+
+ {"cpu-period", r.CPU.Period},
+ {"cpu-rt-period", r.CPU.RealtimePeriod},
+ {"cpu-share", r.CPU.Shares},
+ } {
+ if val := context.String(pair.opt); val != "" {
+ var err error
+ *pair.dest, err = strconv.ParseUint(val, 10, 64)
+ if err != nil {
+ return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+ }
+ }
+ }
+ for _, pair := range []struct {
+ opt string
+ dest *int64
+ }{
+
+ {"cpu-quota", r.CPU.Quota},
+ {"cpu-rt-runtime", r.CPU.RealtimeRuntime},
+ } {
+ if val := context.String(pair.opt); val != "" {
+ var err error
+ *pair.dest, err = strconv.ParseInt(val, 10, 64)
+ if err != nil {
+ return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+ }
+ }
+ }
+ for _, pair := range []struct {
+ opt string
+ dest *int64
+ }{
+ {"memory", r.Memory.Limit},
+ {"memory-swap", r.Memory.Swap},
+ {"kernel-memory", r.Memory.Kernel},
+ {"kernel-memory-tcp", r.Memory.KernelTCP},
+ {"memory-reservation", r.Memory.Reservation},
+ } {
+ if val := context.String(pair.opt); val != "" {
+ var v int64
+
+ if val != "-1" {
+ v, err = units.RAMInBytes(val)
+ if err != nil {
+ return fmt.Errorf("invalid value for %s: %s", pair.opt, err)
+ }
+ } else {
+ v = -1
+ }
+ *pair.dest = v
+ }
+ }
+ r.Pids.Limit = int64(context.Int("pids-limit"))
+ }
+
+ // Update the value
+ config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight
+ config.Cgroups.Resources.CpuPeriod = *r.CPU.Period
+ config.Cgroups.Resources.CpuQuota = *r.CPU.Quota
+ config.Cgroups.Resources.CpuShares = *r.CPU.Shares
+ config.Cgroups.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
+ config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
+ config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus
+ config.Cgroups.Resources.CpusetMems = r.CPU.Mems
+ config.Cgroups.Resources.KernelMemory = *r.Memory.Kernel
+ config.Cgroups.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
+ config.Cgroups.Resources.Memory = *r.Memory.Limit
+ config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation
+ config.Cgroups.Resources.MemorySwap = *r.Memory.Swap
+ config.Cgroups.Resources.PidsLimit = r.Pids.Limit
+
+ // Update Intel RDT
+ l3CacheSchema := context.String("l3-cache-schema")
+ memBwSchema := context.String("mem-bw-schema")
+ if l3CacheSchema != "" && !intelrdt.IsCatEnabled() {
+ return fmt.Errorf("Intel RDT/CAT: l3 cache schema is not enabled")
+ }
+
+ if memBwSchema != "" && !intelrdt.IsMbaEnabled() {
+ return fmt.Errorf("Intel RDT/MBA: memory bandwidth schema is not enabled")
+ }
+
+ if l3CacheSchema != "" || memBwSchema != "" {
+ // If intelRdt is not specified in original configuration, we just don't
+ // Apply() to create intelRdt group or attach tasks for this container.
+ // In update command, we could re-enable through IntelRdtManager.Apply()
+ // and then update intelrdt constraint.
+ if config.IntelRdt == nil {
+ state, err := container.State()
+ if err != nil {
+ return err
+ }
+ config.IntelRdt = &configs.IntelRdt{}
+ intelRdtManager := intelrdt.IntelRdtManager{
+ Config: &config,
+ Id: container.ID(),
+ Path: state.IntelRdtPath,
+ }
+ if err := intelRdtManager.Apply(state.InitProcessPid); err != nil {
+ return err
+ }
+ }
+ config.IntelRdt.L3CacheSchema = l3CacheSchema
+ config.IntelRdt.MemBwSchema = memBwSchema
+ }
+
+ return container.Set(config)
+ },
+}
--- /dev/null
+package main
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+)
+
// Argument-count check modes for checkArgs: they select how the actual
// number of CLI arguments is compared against the expected count.
const (
	exactArgs = iota // NArg() must equal expected
	minArgs          // NArg() must be at least expected
	maxArgs          // NArg() must be at most expected
)
+
+func checkArgs(context *cli.Context, expected, checkType int) error {
+ var err error
+ cmdName := context.Command.Name
+ switch checkType {
+ case exactArgs:
+ if context.NArg() != expected {
+ err = fmt.Errorf("%s: %q requires exactly %d argument(s)", os.Args[0], cmdName, expected)
+ }
+ case minArgs:
+ if context.NArg() < expected {
+ err = fmt.Errorf("%s: %q requires a minimum of %d argument(s)", os.Args[0], cmdName, expected)
+ }
+ case maxArgs:
+ if context.NArg() > expected {
+ err = fmt.Errorf("%s: %q requires a maximum of %d argument(s)", os.Args[0], cmdName, expected)
+ }
+ }
+
+ if err != nil {
+ fmt.Printf("Incorrect Usage.\n\n")
+ cli.ShowCommandHelp(context, cmdName)
+ return err
+ }
+ return nil
+}
+
// fatal records the error in the logrus logger, echoes it on stderr for the
// user, and terminates the program with exit status 1.
func fatal(err error) {
	// make sure the error is written to the logger
	logrus.Error(err)
	// and is visible to the user on stderr before exiting non-zero
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}
+
+// setupSpec performs initial setup based on the cli.Context for the container
+func setupSpec(context *cli.Context) (*specs.Spec, error) {
+ bundle := context.String("bundle")
+ if bundle != "" {
+ if err := os.Chdir(bundle); err != nil {
+ return nil, err
+ }
+ }
+ spec, err := loadSpec(specConfig)
+ if err != nil {
+ return nil, err
+ }
+ return spec, nil
+}
+
+func revisePidFile(context *cli.Context) error {
+ pidFile := context.String("pid-file")
+ if pidFile == "" {
+ return nil
+ }
+
+ // convert pid-file to an absolute path so we can write to the right
+ // file after chdir to bundle
+ pidFile, err := filepath.Abs(pidFile)
+ if err != nil {
+ return err
+ }
+ return context.Set("pid-file", pidFile)
+}
+
// parseBoolOrAuto parses s as a boolean flag value. An empty string or the
// case-insensitive literal "auto" means "no explicit choice was made" and
// yields (nil, nil); any other value is handed to strconv.ParseBool.
func parseBoolOrAuto(s string) (*bool, error) {
	switch strings.ToLower(s) {
	case "", "auto":
		return nil, nil
	}
	b, err := strconv.ParseBool(s)
	return &b, err
}
--- /dev/null
+// +build linux
+
+package main
+
+import (
+ "errors"
+ "fmt"
+ "net"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+
+ "github.com/opencontainers/runc/libcontainer"
+ "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/specconv"
+ "github.com/opencontainers/runc/libcontainer/utils"
+ "github.com/opencontainers/runtime-spec/specs-go"
+
+ "github.com/coreos/go-systemd/activation"
+ "github.com/sirupsen/logrus"
+ "github.com/urfave/cli"
+ "golang.org/x/sys/unix"
+)
+
+var errEmptyID = errors.New("container id cannot be empty")
+
// loadFactory returns the configured factory instance for execing containers.
// It selects the cgroup manager (cgroupfs, rootless cgroupfs, or systemd),
// enables the Intel RDT manager when the hardware/kernel support it, and
// resolves helper binary paths before constructing the libcontainer factory.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
	// The state root comes from the global --root flag; libcontainer wants
	// an absolute path.
	root := context.GlobalString("root")
	abs, err := filepath.Abs(root)
	if err != nil {
		return nil, err
	}

	// We default to cgroupfs, and can only use systemd if the system is a
	// systemd box.
	cgroupManager := libcontainer.Cgroupfs
	rootlessCg, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}
	if rootlessCg {
		cgroupManager = libcontainer.RootlessCgroupfs
	}
	// --systemd-cgroup overrides the cgroupfs choices above, but only when
	// systemd is actually managing cgroups on this host.
	if context.GlobalBool("systemd-cgroup") {
		if systemd.UseSystemd() {
			cgroupManager = libcontainer.SystemdCgroups
		} else {
			return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
		}
	}

	// Only pass an Intel RDT manager if either CAT or MBA is enabled;
	// otherwise the factory gets nil and RDT is skipped entirely.
	intelRdtManager := libcontainer.IntelRdtFs
	if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
		intelRdtManager = nil
	}

	// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
	// to avoid doing a path lookup in the nsexec context. TODO: The binary
	// names are not currently configurable.
	// A lookup failure is not fatal: the empty string simply disables the
	// corresponding helper.
	newuidmap, err := exec.LookPath("newuidmap")
	if err != nil {
		newuidmap = ""
	}
	newgidmap, err := exec.LookPath("newgidmap")
	if err != nil {
		newgidmap = ""
	}

	return libcontainer.New(abs, cgroupManager, intelRdtManager,
		libcontainer.CriuPath(context.GlobalString("criu")),
		libcontainer.NewuidmapPath(newuidmap),
		libcontainer.NewgidmapPath(newgidmap))
}
+
+// getContainer returns the specified container instance by loading it from state
+// with the default factory.
+func getContainer(context *cli.Context) (libcontainer.Container, error) {
+ id := context.Args().First()
+ if id == "" {
+ return nil, errEmptyID
+ }
+ factory, err := loadFactory(context)
+ if err != nil {
+ return nil, err
+ }
+ return factory.Load(id)
+}
+
// fatalf formats its arguments like fmt.Errorf and terminates the program
// via fatal (logging the error and exiting with status 1).
func fatalf(t string, v ...interface{}) {
	fatal(fmt.Errorf(t, v...))
}
+
// getDefaultImagePath returns the default checkpoint image location:
// "checkpoint" under the current working directory. The cli.Context
// parameter is currently unused but kept for signature compatibility.
func getDefaultImagePath(context *cli.Context) string {
	cwd, err := os.Getwd()
	if err != nil {
		// Getwd failing is treated as an unrecoverable environment error.
		panic(err)
	}
	return filepath.Join(cwd, "checkpoint")
}
+
+// newProcess returns a new libcontainer Process with the arguments from the
+// spec and stdio from the current process.
+func newProcess(p specs.Process, init bool) (*libcontainer.Process, error) {
+ lp := &libcontainer.Process{
+ Args: p.Args,
+ Env: p.Env,
+ // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
+ User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
+ Cwd: p.Cwd,
+ Label: p.SelinuxLabel,
+ NoNewPrivileges: &p.NoNewPrivileges,
+ AppArmorProfile: p.ApparmorProfile,
+ Init: init,
+ }
+
+ if p.ConsoleSize != nil {
+ lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
+ lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
+ }
+
+ if p.Capabilities != nil {
+ lp.Capabilities = &configs.Capabilities{}
+ lp.Capabilities.Bounding = p.Capabilities.Bounding
+ lp.Capabilities.Effective = p.Capabilities.Effective
+ lp.Capabilities.Inheritable = p.Capabilities.Inheritable
+ lp.Capabilities.Permitted = p.Capabilities.Permitted
+ lp.Capabilities.Ambient = p.Capabilities.Ambient
+ }
+ for _, gid := range p.User.AdditionalGids {
+ lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
+ }
+ for _, rlimit := range p.Rlimits {
+ rl, err := createLibContainerRlimit(rlimit)
+ if err != nil {
+ return nil, err
+ }
+ lp.Rlimits = append(lp.Rlimits, rl)
+ }
+ return lp, nil
+}
+
// destroy tears the container down, logging (but not propagating) any error
// so cleanup paths can call it unconditionally.
func destroy(container libcontainer.Container) {
	if err := container.Destroy(); err != nil {
		logrus.Error(err)
	}
}
+
// setupIO modifies the given process config according to the options.
// With createTTY the process stdio is detached and a console is arranged:
// either received over an in-process socketpair (foreground) or handed off
// via the caller-provided console socket at sockpath (detached). Without a
// TTY, a detached process inherits runc's stdio, while a foreground one gets
// pipes set up by setupProcessPipes.
func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
	if createTTY {
		// The console replaces stdio entirely, so drop any inherited fds.
		process.Stdin = nil
		process.Stdout = nil
		process.Stderr = nil
		t := &tty{}
		if !detach {
			// Foreground: receive the console master ourselves over a
			// socketpair; the receive happens asynchronously and its
			// result is reported on consoleC (checked via waitConsole).
			parent, child, err := utils.NewSockPair("console")
			if err != nil {
				return nil, err
			}
			process.ConsoleSocket = child
			t.postStart = append(t.postStart, parent, child)
			t.consoleC = make(chan error, 1)
			go func() {
				if err := t.recvtty(process, parent); err != nil {
					t.consoleC <- err
				}
				t.consoleC <- nil
			}()
		} else {
			// the caller of runc will handle receiving the console master
			conn, err := net.Dial("unix", sockpath)
			if err != nil {
				return nil, err
			}
			uc, ok := conn.(*net.UnixConn)
			if !ok {
				return nil, fmt.Errorf("casting to UnixConn failed")
			}
			t.postStart = append(t.postStart, uc)
			socket, err := uc.File()
			if err != nil {
				return nil, err
			}
			// Both the connection and its dup'd file are closed after
			// start via postStart.
			t.postStart = append(t.postStart, socket)
			process.ConsoleSocket = socket
		}
		return t, nil
	}
	// when runc will detach the caller provides the stdio to runc via runc's 0,1,2
	// and the container's process inherits runc's stdio.
	if detach {
		if err := inheritStdio(process); err != nil {
			return nil, err
		}
		return &tty{}, nil
	}
	return setupProcessPipes(process, rootuid, rootgid)
}
+
+// createPidFile creates a file with the processes pid inside it atomically
+// it creates a temp file with the paths filename + '.' infront of it
+// then renames the file
+func createPidFile(path string, process *libcontainer.Process) error {
+ pid, err := process.Pid()
+ if err != nil {
+ return err
+ }
+ var (
+ tmpDir = filepath.Dir(path)
+ tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path)))
+ )
+ f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666)
+ if err != nil {
+ return err
+ }
+ _, err = fmt.Fprintf(f, "%d", pid)
+ f.Close()
+ if err != nil {
+ return err
+ }
+ return os.Rename(tmpName, path)
+}
+
+func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
+ rootlessCg, err := shouldUseRootlessCgroupManager(context)
+ if err != nil {
+ return nil, err
+ }
+ config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
+ CgroupName: id,
+ UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
+ NoPivotRoot: context.Bool("no-pivot"),
+ NoNewKeyring: context.Bool("no-new-keyring"),
+ Spec: spec,
+ RootlessEUID: os.Geteuid() != 0,
+ RootlessCgroups: rootlessCg,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ factory, err := loadFactory(context)
+ if err != nil {
+ return nil, err
+ }
+ return factory.Create(id, config)
+}
+
// runner bundles everything needed to start (or restore) a container and
// supervise its process; see run for how the fields are used.
type runner struct {
	init            bool       // run the process as the container's init process
	enableSubreaper bool       // forwarded to newSignalHandler in run
	shouldDestroy   bool       // destroy() actually destroys the container only when set
	detach          bool       // don't wait for the container process to exit
	listenFDs       []*os.File // socket-activation fds passed through to the process
	preserveFDs     int        // number of additional caller fds (starting at 3+ExtraFiles) to pass through
	pidFile         string     // where to write the process pid ("" disables)
	consoleSocket   string     // unix socket path for handing off the console master
	container       libcontainer.Container
	action          CtAct // which lifecycle operation run performs (create/run/restore)
	notifySocket    *notifySocket
	criuOpts        *libcontainer.CriuOpts // restore options, used for CT_ACT_RESTORE
}
+
// run executes the configured lifecycle action (create/run/restore) for the
// given process spec and returns the process exit status. On any failure the
// container is destroyed (if owned) and -1 is returned; when detaching, run
// returns 0 immediately after startup bookkeeping completes.
func (r *runner) run(config *specs.Process) (int, error) {
	// Validate terminal/console-socket flags before doing any real work.
	if err := r.checkTerminal(config); err != nil {
		r.destroy()
		return -1, err
	}
	process, err := newProcess(*config, r.init)
	if err != nil {
		r.destroy()
		return -1, err
	}
	// Socket activation: advertise the fds via the LISTEN_* convention.
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}
	// Pass through the caller's extra fds, numbered after stdio (0-2) and
	// any ExtraFiles already attached above.
	baseFd := 3 + len(process.ExtraFiles)
	for i := baseFd; i < baseFd+r.preserveFDs; i++ {
		process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
	}
	rootuid, err := r.container.Config().HostRootUID()
	if err != nil {
		r.destroy()
		return -1, err
	}
	rootgid, err := r.container.Config().HostRootGID()
	if err != nil {
		r.destroy()
		return -1, err
	}
	var (
		// "create" always behaves as detached regardless of --detach.
		detach = r.detach || (r.action == CT_ACT_CREATE)
	)
	// Setting up IO is a two stage process. We need to modify process to deal
	// with detaching containers, and then we get a tty after the container has
	// started.
	handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
	if err != nil {
		r.destroy()
		return -1, err
	}
	defer tty.Close()

	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
	if err != nil {
		r.destroy()
		return -1, err
	}
	// From here on the process exists, so failures must kill it (terminate)
	// before destroying the container.
	if err := tty.waitConsole(); err != nil {
		r.terminate(process)
		r.destroy()
		return -1, err
	}
	if err = tty.ClosePostStart(); err != nil {
		r.terminate(process)
		r.destroy()
		return -1, err
	}
	if r.pidFile != "" {
		if err = createPidFile(r.pidFile, process); err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
	}
	status, err := handler.forward(process, tty, detach)
	if err != nil {
		r.terminate(process)
	}
	if detach {
		// Detached containers are left running; their exit status is not
		// ours to report.
		return 0, nil
	}
	r.destroy()
	return status, err
}
+
// destroy removes the container, but only when this runner is responsible
// for cleanup (shouldDestroy set).
func (r *runner) destroy() {
	if r.shouldDestroy {
		destroy(r.container)
	}
}
+
// terminate force-kills the container process with SIGKILL and reaps it.
// Errors are deliberately ignored: this is best-effort cleanup on failure
// paths.
func (r *runner) terminate(p *libcontainer.Process) {
	_ = p.Signal(unix.SIGKILL)
	_, _ = p.Wait()
}
+
+func (r *runner) checkTerminal(config *specs.Process) error {
+ detach := r.detach || (r.action == CT_ACT_CREATE)
+ // Check command-line for sanity.
+ if detach && config.Terminal && r.consoleSocket == "" {
+ return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
+ }
+ if (!detach || !config.Terminal) && r.consoleSocket != "" {
+ return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
+ }
+ return nil
+}
+
+func validateProcessSpec(spec *specs.Process) error {
+ if spec.Cwd == "" {
+ return fmt.Errorf("Cwd property must not be empty")
+ }
+ if !filepath.IsAbs(spec.Cwd) {
+ return fmt.Errorf("Cwd must be an absolute path")
+ }
+ if len(spec.Args) == 0 {
+ return fmt.Errorf("args must not be empty")
+ }
+ return nil
+}
+
// CtAct selects which container lifecycle operation runner.run performs.
type CtAct uint8

const (
	CT_ACT_CREATE  CtAct = iota + 1 // container.Start (create; always detached)
	CT_ACT_RUN                      // container.Run
	CT_ACT_RESTORE                  // container.Restore with criuOpts
)
+
// startContainer creates a container from the spec and runs the requested
// lifecycle action, returning the container process's exit status. The
// container id is taken from the first positional argument. When the
// NOTIFY_SOCKET environment variable is set, an sd_notify proxy socket is
// wired into the spec before creation.
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	}

	notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
	if notifySocket != nil {
		// Mutate the spec before the container config is generated.
		notifySocket.setupSpec(context, spec)
	}

	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}

	if notifySocket != nil {
		err := notifySocket.setupSocket()
		if err != nil {
			return -1, err
		}
	}

	// Support on-demand socket activation by passing file descriptors into the container init process.
	listenFDs := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		listenFDs = activation.Files(false)
	}
	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
	}
	return r.run(spec.Process)
}
--- /dev/null
+# OCI runtime-spec. When updating this, make sure you use a version tag rather
+# than a commit ID so it's much more obvious what version of the spec we are
+# using.
+github.com/opencontainers/runtime-spec 5684b8af48c1ac3b1451fa499724e30e3c20a294
+# Core libcontainer functionality.
+github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08
+github.com/opencontainers/selinux v1.0.0-rc1
+github.com/seccomp/libseccomp-golang 84e90a91acea0f4e51e62bc1a75de18b1fc0790f
+github.com/sirupsen/logrus a3f95b5c423586578a4e099b11a46c2479628cac
+github.com/syndtr/gocapability db04d3cc01c8b54962a58ec7e491717d06cfcc16
+github.com/vishvananda/netlink 1e2e08e8a2dcdacaae3f14ac44c5cfa31361f270
+# systemd integration.
+github.com/coreos/go-systemd v14
+github.com/coreos/pkg v3
+github.com/godbus/dbus v3
+github.com/golang/protobuf 18c9bb3261723cd5401db4d0c9fbc5c3b6c70fe8
+# Command-line interface.
+github.com/cyphar/filepath-securejoin v0.2.1
+github.com/docker/go-units v0.2.0
+github.com/urfave/cli d53eb991652b1d438abdd34ce4bfa3ef1539108e
+golang.org/x/sys 7ddbeae9ae08c6a06a59597f0c9edbc5ff2444ce https://github.com/golang/sys
+
+# console dependencies
+github.com/containerd/console 2748ece16665b45a47f884001d5831ec79703880
+github.com/pkg/errors v0.8.0
--- /dev/null
+Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+Copyright (C) 2017 SUSE LLC. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null
+## `filepath-securejoin` ##
+
+[](https://travis-ci.org/cyphar/filepath-securejoin)
+
+An implementation of `SecureJoin`, a [candidate for inclusion in the Go
+standard library][go#20126]. The purpose of this function is to be a "secure"
+alternative to `filepath.Join`, and in particular it provides certain
+guarantees that are not provided by `filepath.Join`.
+
+This is the function prototype:
+
+```go
+func SecureJoin(root, unsafePath string) (string, error)
+```
+
+This library **guarantees** the following:
+
* If no error is set, the resulting string **must** be a child path of
  `root` and will not contain any symlink path components (they will all
  be expanded).
+
+* When expanding symlinks, all symlink path components **must** be resolved
+ relative to the provided root. In particular, this can be considered a
+ userspace implementation of how `chroot(2)` operates on file paths. Note that
+ these symlinks will **not** be expanded lexically (`filepath.Clean` is not
+ called on the input before processing).
+
* Non-existent path components are unaffected by `SecureJoin` (similar to
+ `filepath.EvalSymlinks`'s semantics).
+
+* The returned path will always be `filepath.Clean`ed and thus not contain any
+ `..` components.
+
+A (trivial) implementation of this function on GNU/Linux systems could be done
+with the following (note that this requires root privileges and is far more
+opaque than the implementation in this library, and also requires that
+`readlink` is inside the `root` path):
+
+```go
+package securejoin
+
+import (
+ "os/exec"
+ "path/filepath"
+)
+
+func SecureJoin(root, unsafePath string) (string, error) {
+ unsafePath = string(filepath.Separator) + unsafePath
+ cmd := exec.Command("chroot", root,
+ "readlink", "--canonicalize-missing", "--no-newline", unsafePath)
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return "", err
+ }
+ expanded := string(output)
+ return filepath.Join(root, expanded), nil
+}
+```
+
+[go#20126]: https://github.com/golang/go/issues/20126
+
+### License ###
+
+The license of this project is the same as Go, which is a BSD 3-clause license
+available in the `LICENSE` file.
--- /dev/null
+// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+// Copyright (C) 2017 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package securejoin is an implementation of the hopefully-soon-to-be-included
+// SecureJoin helper that is meant to be part of the "path/filepath" package.
+// The purpose of this project is to provide a PoC implementation to make the
+// SecureJoin proposal (https://github.com/golang/go/issues/20126) more
+// tangible.
+package securejoin
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+
+ "github.com/pkg/errors"
+)
+
// ErrSymlinkLoop is returned by SecureJoinVFS when too many symlinks have
// been evaluated in attempting to securely join the two given paths (the
// limit guards against symlink loops).
var ErrSymlinkLoop = fmt.Errorf("SecureJoin: too many links")
+
// IsNotExist tells you if err is an error that implies that either the path
// accessed does not exist (or path components don't exist). This is
// effectively a more broad version of os.IsNotExist.
func IsNotExist(err error) bool {
	// If it's a bona fide ENOENT just bail.
	if os.IsNotExist(errors.Cause(err)) {
		return true
	}

	// Check that it's not actually an ENOTDIR, which in some cases is a more
	// convoluted case of ENOENT (usually involving weird paths).
	// Unwrap the known os error wrappers to get at the raw errno.
	var errno error
	switch err := errors.Cause(err).(type) {
	case *os.PathError:
		errno = err.Err
	case *os.LinkError:
		errno = err.Err
	case *os.SyscallError:
		errno = err.Err
	}
	return errno == syscall.ENOTDIR || errno == syscall.ENOENT
}
+
// SecureJoinVFS joins the two given path components (similar to Join) except
// that the returned path is guaranteed to be scoped inside the provided root
// path (when evaluated). Any symbolic links in the path are evaluated with the
// given root treated as the root of the filesystem, similar to a chroot. The
// filesystem state is evaluated through the given VFS interface (if nil, the
// standard os.* family of functions are used).
//
// Note that the guarantees provided by this function only apply if the path
// components in the returned string are not modified (in other words are not
// replaced with symlinks on the filesystem) after this function has returned.
// Such a symlink race is necessarily out-of-scope of SecureJoin.
func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) {
	// Use the os.* VFS implementation if none was specified.
	if vfs == nil {
		vfs = osVFS{}
	}

	// path accumulates the already-resolved (symlink-free) portion; n counts
	// symlink dereferences so loops terminate via ErrSymlinkLoop.
	var path bytes.Buffer
	n := 0
	for unsafePath != "" {
		if n > 255 {
			return "", ErrSymlinkLoop
		}

		// Next path component, p.
		i := strings.IndexRune(unsafePath, filepath.Separator)
		var p string
		if i == -1 {
			p, unsafePath = unsafePath, ""
		} else {
			p, unsafePath = unsafePath[:i], unsafePath[i+1:]
		}

		// Create a cleaned path, using the lexical semantics of /../a, to
		// create a "scoped" path component which can safely be joined to fullP
		// for evaluation. At this point, path.String() doesn't contain any
		// symlink components.
		cleanP := filepath.Clean(string(filepath.Separator) + path.String() + p)
		if cleanP == string(filepath.Separator) {
			// The component walked us back to the root: drop everything
			// accumulated so far.
			path.Reset()
			continue
		}
		fullP := filepath.Clean(root + cleanP)

		// Figure out whether the path is a symlink.
		fi, err := vfs.Lstat(fullP)
		if err != nil && !IsNotExist(err) {
			return "", err
		}
		// Treat non-existent path components the same as non-symlinks (we
		// can't do any better here).
		if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 {
			path.WriteString(p)
			path.WriteRune(filepath.Separator)
			continue
		}

		// Only increment when we actually dereference a link.
		n++

		// It's a symlink, expand it by prepending it to the yet-unparsed path.
		dest, err := vfs.Readlink(fullP)
		if err != nil {
			return "", err
		}
		// Absolute symlinks reset any work we've already done.
		if filepath.IsAbs(dest) {
			path.Reset()
		}
		unsafePath = dest + string(filepath.Separator) + unsafePath
	}

	// We have to clean path.String() here because it may contain '..'
	// components that are entirely lexical, but would be misleading otherwise.
	// And finally do a final clean to ensure that root is also lexically
	// clean.
	fullP := filepath.Clean(string(filepath.Separator) + path.String())
	return filepath.Clean(root + fullP), nil
}
+
// SecureJoin is a wrapper around SecureJoinVFS that just uses the os.* library
// of functions as the VFS. If in doubt, use this function over SecureJoinVFS.
func SecureJoin(root, unsafePath string) (string, error) {
	return SecureJoinVFS(root, unsafePath, nil)
}
--- /dev/null
+github.com/pkg/errors v0.8.0
--- /dev/null
+// Copyright (C) 2017 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import "os"
+
// In future this should be moved into a separate package, because now there
// are several projects (umoci and go-mtree) that are using this sort of
// interface.

// VFS is the minimal interface necessary to use SecureJoinVFS. A nil VFS is
// equivalent to using the standard os.* family of functions. This is mainly
// used for the purposes of mock testing, but also can be used to otherwise
// use SecureJoin with a VFS-like system.
type VFS interface {
	// Lstat returns a FileInfo describing the named file. If the file is a
	// symbolic link, the returned FileInfo describes the symbolic link. Lstat
	// makes no attempt to follow the link. These semantics are identical to
	// os.Lstat.
	Lstat(name string) (os.FileInfo, error)

	// Readlink returns the destination of the named symbolic link. These
	// semantics are identical to os.Readlink.
	Readlink(name string) (string, error)
}
+
// osVFS is the "nil" VFS: every method simply delegates to the
// corresponding function in the os package.
type osVFS struct{}

// Lstat returns a FileInfo describing the named file without following
// symbolic links; semantics are identical to os.Lstat.
func (o osVFS) Lstat(name string) (os.FileInfo, error) {
	return os.Lstat(name)
}

// Readlink returns the destination of the named symbolic link; semantics
// are identical to os.Readlink.
func (o osVFS) Readlink(name string) (string, error) {
	return os.Readlink(name)
}