From 1c04b0bbb7c76ef25dda4f9d8fe35e2aabb8874b Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Fri, 19 Dec 2014 04:54:12 +0000 Subject: [PATCH] Import docker.io_1.3.3~dfsg1.orig-libcontainer.tar.gz [dgit import orig docker.io_1.3.3~dfsg1.orig-libcontainer.tar.gz] --- .travis.yml | 36 + CONTRIBUTORS_GUIDE.md | 204 ++++ Dockerfile | 23 + LICENSE | 191 ++++ MAINTAINERS | 6 + MAINTAINERS_GUIDE.md | 99 ++ Makefile | 24 + NOTICE | 16 + PRINCIPLES.md | 19 + README.md | 62 ++ ROADMAP.md | 16 + api_temp.go | 34 + apparmor/apparmor.go | 35 + apparmor/apparmor_disabled.go | 11 + apparmor/gen.go | 94 ++ apparmor/setup.go | 44 + cgroups/cgroups.go | 60 + cgroups/cgroups_test.go | 27 + cgroups/cgutil/cgutil.go | 264 +++++ cgroups/cgutil/sample_cgroup.json | 10 + cgroups/fs/apply_raw.go | 248 +++++ cgroups/fs/blkio.go | 167 +++ cgroups/fs/blkio_test.go | 248 +++++ cgroups/fs/cpu.go | 72 ++ cgroups/fs/cpu_test.go | 69 ++ cgroups/fs/cpuacct.go | 110 ++ cgroups/fs/cpuset.go | 119 ++ cgroups/fs/devices.go | 34 + cgroups/fs/freezer.go | 50 + cgroups/fs/memory.go | 93 ++ cgroups/fs/memory_test.go | 134 +++ cgroups/fs/notify_linux.go | 82 ++ cgroups/fs/notify_linux_test.go | 86 ++ cgroups/fs/perf_event.go | 24 + cgroups/fs/stats_util_test.go | 73 ++ cgroups/fs/util_test.go | 60 + cgroups/fs/utils.go | 62 ++ cgroups/fs/utils_test.go | 95 ++ cgroups/stats.go | 69 ++ cgroups/systemd/apply_nosystemd.go | 29 + cgroups/systemd/apply_systemd.go | 358 ++++++ cgroups/utils.go | 194 ++++ config.go | 89 ++ config_test.go | 160 +++ console/console.go | 128 +++ container.go | 78 ++ devices/defaults.go | 159 +++ devices/devices.go | 129 +++ devices/devices_test.go | 61 ++ devices/number.go | 26 + error.go | 37 + factory.go | 32 + label/label.go | 45 + label/label_selinux.go | 132 +++ label/label_selinux_test.go | 48 + mount/init.go | 208 ++++ mount/mount.go | 109 ++ mount/mount_config.go | 28 + mount/msmoveroot.go | 20 + mount/nodes/nodes.go | 57 + mount/nodes/nodes_unsupported.go | 13 + mount/pivotroot.go | 34 + mount/ptmx.go | 30 + mount/readonly.go | 11 + mount/remount.go | 31 + namespaces/create.go | 10 + namespaces/exec.go | 200 ++++ namespaces/execin.go | 119 ++ namespaces/init.go | 263 +++++ namespaces/nsenter/README.md | 6 + namespaces/nsenter/nsenter.c | 218 ++++ namespaces/nsenter/nsenter.go | 10 + namespaces/nsenter/nsenter_unsupported.go | 3 + namespaces/types.go | 50 + namespaces/types_linux.go | 16 + namespaces/types_test.go | 30 + netlink/MAINTAINERS | 2 + netlink/netlink.go | 31 + netlink/netlink_linux.go | 1219 +++++++++++++++++++++ netlink/netlink_linux_arm.go | 5 + netlink/netlink_linux_notarm.go | 7 + netlink/netlink_linux_test.go | 356 ++++++ netlink/netlink_unsupported.go | 84 ++ network/loopback.go | 23 + network/netns.go | 39 + network/network.go | 97 ++ network/stats.go | 69 ++ network/strategy.go | 35 + network/types.go | 50 + network/veth.go | 121 ++ network/veth_test.go | 53 + nsinit/config.go | 29 + nsinit/exec.go | 208 ++++ nsinit/init.go | 52 + nsinit/main.go | 67 ++ nsinit/nsenter.go | 84 ++ nsinit/pause.go | 49 + nsinit/stats.go | 39 + nsinit/utils.go | 94 ++ process.go | 27 + sample_configs/README.md | 5 + sample_configs/apparmor.json | 196 ++++ sample_configs/attach_to_bridge.json | 202 ++++ sample_configs/minimal.json | 201 ++++ sample_configs/selinux.json | 197 ++++ security/capabilities/capabilities.go | 56 + security/capabilities/types.go | 88 ++ security/capabilities/types_test.go | 19 + security/restrict/restrict.go | 53 + security/restrict/unsupported.go | 9 + selinux/selinux.go | 436 ++++++++ selinux/selinux_test.go | 64 ++ state.go | 77 ++ syncpipe/sync_pipe.go | 105 ++ syncpipe/sync_pipe_linux.go | 20 + syncpipe/sync_pipe_test.go | 72 ++ system/linux.go | 60 + system/proc.go | 27 + system/setns_linux.go | 31 + system/sysconfig.go | 12 + system/sysconfig_notcgo.go | 8 + system/xattrs_linux.go | 99 ++ types.go | 11 + update-vendor.sh | 48 + user/MAINTAINERS | 1 + user/user.go | 258 +++++ user/user_test.go | 94 ++ utils/utils.go | 55 + xattr/xattr.go | 53 + xattr/xattr_test.go | 77 ++ 130 files changed, 11815 insertions(+) create mode 100644 .travis.yml create mode 100644 CONTRIBUTORS_GUIDE.md create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 MAINTAINERS create mode 100644 MAINTAINERS_GUIDE.md create mode 100644 Makefile create mode 100644 NOTICE create mode 100644 PRINCIPLES.md create mode 100644 README.md create mode 100644 ROADMAP.md create mode 100644 api_temp.go create mode 100644 apparmor/apparmor.go create mode 100644 apparmor/apparmor_disabled.go create mode 100644 apparmor/gen.go create mode 100644 apparmor/setup.go create mode 100644 cgroups/cgroups.go create mode 100644 cgroups/cgroups_test.go create mode 100644 cgroups/cgutil/cgutil.go create mode 100644 cgroups/cgutil/sample_cgroup.json create mode 100644 cgroups/fs/apply_raw.go create mode 100644 cgroups/fs/blkio.go create mode 100644 cgroups/fs/blkio_test.go create mode 100644 cgroups/fs/cpu.go create mode 100644 cgroups/fs/cpu_test.go create mode 100644 cgroups/fs/cpuacct.go create mode 100644 cgroups/fs/cpuset.go create mode 100644 cgroups/fs/devices.go create mode 100644 cgroups/fs/freezer.go create mode 100644 cgroups/fs/memory.go create mode 100644 cgroups/fs/memory_test.go create mode 100644 cgroups/fs/notify_linux.go create mode 100644 cgroups/fs/notify_linux_test.go create mode 100644 cgroups/fs/perf_event.go create mode 100644 cgroups/fs/stats_util_test.go create mode 100644 cgroups/fs/util_test.go create mode 100644 cgroups/fs/utils.go create mode 100644 cgroups/fs/utils_test.go create mode 100644 cgroups/stats.go create mode 100644 cgroups/systemd/apply_nosystemd.go create mode 100644 cgroups/systemd/apply_systemd.go create mode 100644 cgroups/utils.go create mode 100644 config.go create mode 100644 config_test.go create mode 100644 console/console.go create mode 100644 container.go create mode 100644 devices/defaults.go create mode 100644 devices/devices.go create mode 100644 devices/devices_test.go create mode 100644 devices/number.go create mode 100644 error.go create mode 100644 factory.go create mode 100644 label/label.go create mode 100644 label/label_selinux.go create mode 100644 label/label_selinux_test.go create mode 100644 mount/init.go create mode 100644 mount/mount.go create mode 100644 mount/mount_config.go create mode 100644 mount/msmoveroot.go create mode 100644 mount/nodes/nodes.go create mode 100644 mount/nodes/nodes_unsupported.go create mode 100644 mount/pivotroot.go create mode 100644 mount/ptmx.go create mode 100644 mount/readonly.go create mode 100644 mount/remount.go create mode 100644 namespaces/create.go create mode 100644 namespaces/exec.go create mode 100644 namespaces/execin.go create mode 100644 namespaces/init.go create mode 100644 namespaces/nsenter/README.md create mode 100644 namespaces/nsenter/nsenter.c create mode 100644 namespaces/nsenter/nsenter.go create mode 100644 namespaces/nsenter/nsenter_unsupported.go create mode 100644 namespaces/types.go create mode 100644 namespaces/types_linux.go create mode 100644 namespaces/types_test.go create mode 100644 netlink/MAINTAINERS create mode 100644 netlink/netlink.go create mode 100644 netlink/netlink_linux.go create mode 100644 netlink/netlink_linux_arm.go create mode 100644 netlink/netlink_linux_notarm.go create mode 100644 netlink/netlink_linux_test.go create mode 100644 netlink/netlink_unsupported.go create mode 100644 network/loopback.go create mode 100644 network/netns.go create mode 100644 network/network.go create mode 100644 network/stats.go create mode 100644 network/strategy.go create mode 100644 network/types.go create mode 100644 network/veth.go create mode 100644 network/veth_test.go create mode 100644 nsinit/config.go create mode 100644 nsinit/exec.go create mode 100644 nsinit/init.go create mode 100644 nsinit/main.go create mode 100644 nsinit/nsenter.go create mode 100644 nsinit/pause.go create mode 100644 nsinit/stats.go create mode 100644 nsinit/utils.go create mode 100644 process.go create mode 100644 sample_configs/README.md create mode 100644 sample_configs/apparmor.json create mode 100644 sample_configs/attach_to_bridge.json create mode 100644 sample_configs/minimal.json create mode 100644 sample_configs/selinux.json create mode 100644 security/capabilities/capabilities.go create mode 100644 security/capabilities/types.go create mode 100644 security/capabilities/types_test.go create mode 100644 security/restrict/restrict.go create mode 100644 security/restrict/unsupported.go create mode 100644 selinux/selinux.go create mode 100644 selinux/selinux_test.go create mode 100644 state.go create mode 100644 syncpipe/sync_pipe.go create mode 100644 syncpipe/sync_pipe_linux.go create mode 100644 syncpipe/sync_pipe_test.go create mode 100644 system/linux.go create mode 100644 system/proc.go create mode 100644 system/setns_linux.go create mode 100644 system/sysconfig.go create mode 100644 system/sysconfig_notcgo.go create mode 100644 system/xattrs_linux.go create mode 100644 types.go create mode 100755 update-vendor.sh create mode 100644 user/MAINTAINERS create mode 100644 user/user.go create mode 100644 user/user_test.go create mode 100644 utils/utils.go create mode 100644 xattr/xattr.go create mode 100644 xattr/xattr_test.go diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..3ce0e27e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,36 @@ +language: go +go: 1.3 + +# let us have pretty experimental Docker-based Travis workers +sudo: false + +env: + - TRAVIS_GLOBAL_WTF=1 + - _GOOS=linux _GOARCH=amd64 CGO_ENABLED=1 + - _GOOS=linux _GOARCH=amd64 CGO_ENABLED=0 +# - _GOOS=linux _GOARCH=386 CGO_ENABLED=1 # TODO add this once Travis can handle it (https://github.com/travis-ci/travis-ci/issues/2207#issuecomment-49625061) + - _GOOS=linux _GOARCH=386 CGO_ENABLED=0 + - _GOOS=linux _GOARCH=arm CGO_ENABLED=0 + +install: + - go get code.google.com/p/go.tools/cmd/cover + - mkdir -pv "${GOPATH%%:*}/src/github.com/docker" && [ -d "${GOPATH%%:*}/src/github.com/docker/libcontainer" ] || ln -sv "$(readlink -f .)" "${GOPATH%%:*}/src/github.com/docker/libcontainer" + - if [ -z "$TRAVIS_GLOBAL_WTF" ]; then + gvm cross "$_GOOS" "$_GOARCH"; + export GOOS="$_GOOS" GOARCH="$_GOARCH"; + fi + - export GOPATH="$GOPATH:$(pwd)/vendor" + - if [ -z "$TRAVIS_GLOBAL_WTF" ]; then go env; fi + - go get -d -v ./... # TODO remove this if /docker/docker gets purged from our includes + - if [ "$TRAVIS_GLOBAL_WTF" ]; then + export DOCKER_PATH="${GOPATH%%:*}/src/github.com/docker/docker"; + mkdir -p "$DOCKER_PATH/hack/make"; + ( cd "$DOCKER_PATH/hack/make" && wget -c 'https://raw.githubusercontent.com/docker/docker/master/hack/make/'{.validate,validate-dco,validate-gofmt} ); + sed -i 's!docker/docker!docker/libcontainer!' "$DOCKER_PATH/hack/make/.validate"; + fi + +script: + - if [ "$TRAVIS_GLOBAL_WTF" ]; then bash "$DOCKER_PATH/hack/make/validate-dco"; fi + - if [ "$TRAVIS_GLOBAL_WTF" ]; then bash "$DOCKER_PATH/hack/make/validate-gofmt"; fi + - if [ -z "$TRAVIS_GLOBAL_WTF" ]; then make direct-build; fi + - if [ -z "$TRAVIS_GLOBAL_WTF" -a "$GOARCH" != 'arm' ]; then make direct-test-short; fi diff --git a/CONTRIBUTORS_GUIDE.md b/CONTRIBUTORS_GUIDE.md new file mode 100644 index 00000000..07bf22a0 --- /dev/null +++ b/CONTRIBUTORS_GUIDE.md @@ -0,0 +1,204 @@ +# The libcontainer Contributors' Guide + +Want to hack on libcontainer? Awesome! Here are instructions to get you +started. They are probably not perfect, please let us know if anything +feels wrong or incomplete. + +## Reporting Issues + +When reporting [issues](https://github.com/docker/libcontainer/issues) +on GitHub please include your host OS (Ubuntu 12.04, Fedora 19, etc), +the output of `uname -a`. Please include the steps required to reproduce +the problem if possible and applicable. +This information will help us review and fix your issue faster. + +## Development Environment + +*Add instructions on setting up the development environment.* + +## Contribution Guidelines + +### Pull requests are always welcome + +We are always thrilled to receive pull requests, and do our best to +process them as fast as possible. Not sure if that typo is worth a pull +request? Do it! We will appreciate it. + +If your pull request is not accepted on the first try, don't be +discouraged! If there's a problem with the implementation, hopefully you +received feedback on what to improve. + +We're trying very hard to keep libcontainer lean and focused. We don't want it +to do everything for everybody. This means that we might decide against +incorporating a new feature. However, there might be a way to implement +that feature *on top of* libcontainer. + +### Discuss your design on the mailing list + +We recommend discussing your plans [on the mailing +list](https://groups.google.com/forum/?fromgroups#!forum/libcontainer) +before starting to code - especially for more ambitious contributions. +This gives other contributors a chance to point you in the right +direction, give feedback on your design, and maybe point out if someone +else is working on the same thing. + +### Create issues... + +Any significant improvement should be documented as [a GitHub +issue](https://github.com/docker/libcontainer/issues) before anybody +starts working on it. + +### ...but check for existing issues first! + +Please take a moment to check that an issue doesn't already exist +documenting your bug report or improvement proposal. If it does, it +never hurts to add a quick "+1" or "I have this problem too". This will +help prioritize the most common problems and requests. + +### Conventions + +Fork the repo and make changes on your fork in a feature branch: + +- If it's a bugfix branch, name it XXX-something where XXX is the number of the + issue +- If it's a feature branch, create an enhancement issue to announce your + intentions, and name it XXX-something where XXX is the number of the issue. + +Submit unit tests for your changes. Go has a great test framework built in; use +it! Take a look at existing tests for inspiration. Run the full test suite on +your branch before submitting a pull request. + +Update the documentation when creating or modifying features. Test +your documentation changes for clarity, concision, and correctness, as +well as a clean documentation build. See ``docs/README.md`` for more +information on building the docs and how docs get released. + +Write clean code. Universally formatted code promotes ease of writing, reading, +and maintenance. Always run `gofmt -s -w file.go` on each changed file before +committing your changes. Most editors have plugins that do this automatically. + +Pull requests descriptions should be as clear as possible and include a +reference to all the issues that they address. + +Pull requests must not contain commits from other users or branches. + +Commit messages must start with a capitalized and short summary (max. 50 +chars) written in the imperative, followed by an optional, more detailed +explanatory text which is separated from the summary by an empty line. + +Code review comments may be added to your pull request. Discuss, then make the +suggested modifications and push additional commits to your feature branch. Be +sure to post a comment after pushing. The new commits will show up in the pull +request automatically, but the reviewers will not be notified unless you +comment. + +Before the pull request is merged, make sure that you squash your commits into +logical units of work using `git rebase -i` and `git push -f`. After every +commit the test suite should be passing. Include documentation changes in the +same commit so that a revert would remove all traces of the feature or fix. + +Commits that fix or close an issue should include a reference like `Closes #XXX` +or `Fixes #XXX`, which will automatically close the issue when merged. + +### Testing + +Make sure you include suitable tests, preferably unit tests, in your pull request +and that all the tests pass. + +*Instructions for running tests to be added.* + +### Merge approval + +libcontainer maintainers use LGTM (looks good to me) in comments on the code review +to indicate acceptance. + +A change requires LGTMs from at lease two maintainers. One of those must come from +a maintainer of the component affected. For example, if a change affects `netlink/` +and `security`, it needs at least one LGTM from a maintainer of each. Maintainers +only need one LGTM as presumably they LGTM their own change. + +For more details see [MAINTAINERS.md](MAINTAINERS.md) + +### Sign your work + +The sign-off is a simple line at the end of the explanation for the +patch, which certifies that you wrote it or otherwise have the right to +pass it on as an open-source patch. The rules are pretty simple: if you +can certify the below (from +[developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +then you just add a line to every git commit message: + + Docker-DCO-1.1-Signed-off-by: Joe Smith (github: github_handle) + +using your real name (sorry, no pseudonyms or anonymous contributions.) + +One way to automate this, is customise your get ``commit.template`` by adding +a ``prepare-commit-msg`` hook to your libcontainer checkout: + +``` +curl -o .git/hooks/prepare-commit-msg https://raw.githubusercontent.com/docker/docker/master/contrib/prepare-commit-msg.hook && chmod +x .git/hooks/prepare-commit-msg +``` + +* Note: the above script expects to find your GitHub user name in ``git config --get github.user`` + +#### Small patch exception + +There are several exceptions to the signing requirement. Currently these are: + +* Your patch fixes spelling or grammar errors. +* Your patch is a single line change to documentation contained in the + `docs` directory. +* Your patch fixes Markdown formatting or syntax errors in the + documentation contained in the `docs` directory. + +If you have any questions, please refer to the FAQ in the [docs](to be written) + +### How can I become a maintainer? + +* Step 1: learn the component inside out +* Step 2: make yourself useful by contributing code, bugfixes, support etc. +* Step 3: volunteer on the irc channel (#libcontainer@freenode) + +Don't forget: being a maintainer is a time investment. Make sure you will have time to make yourself available. +You don't have to be a maintainer to make a difference on the project! + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..65bf5731 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM crosbymichael/golang + +RUN apt-get update && apt-get install -y gcc make +RUN go get code.google.com/p/go.tools/cmd/cover + +# setup a playground for us to spawn containers in +RUN mkdir /busybox && \ + curl -sSL 'https://github.com/jpetazzo/docker-busybox/raw/buildroot-2014.02/rootfs.tar' | tar -xC /busybox + +RUN curl -sSL https://raw.githubusercontent.com/docker/docker/master/hack/dind -o /dind && \ + chmod +x /dind + +COPY . /go/src/github.com/docker/libcontainer +WORKDIR /go/src/github.com/docker/libcontainer +RUN cp sample_configs/minimal.json /busybox/container.json + +ENV GOPATH $GOPATH:/go/src/github.com/docker/libcontainer/vendor + +RUN go get -d -v ./... +RUN make direct-install + +ENTRYPOINT ["/dind"] +CMD ["make", "direct-test"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..27448585 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MAINTAINERS b/MAINTAINERS new file mode 100644 index 00000000..24011b05 --- /dev/null +++ b/MAINTAINERS @@ -0,0 +1,6 @@ +Michael Crosby (@crosbymichael) +Rohit Jnagal (@rjnagal) +Victor Marmol (@vmarmol) +Mrunal Patel (@mrunalp) +.travis.yml: Tianon Gravi (@tianon) +update-vendor.sh: Tianon Gravi (@tianon) diff --git a/MAINTAINERS_GUIDE.md b/MAINTAINERS_GUIDE.md new file mode 100644 index 00000000..2ac9ca21 --- /dev/null +++ b/MAINTAINERS_GUIDE.md @@ -0,0 +1,99 @@ +# The libcontainer Maintainers' Guide + +## Introduction + +Dear maintainer. Thank you for investing the time and energy to help +make libcontainer as useful as possible. Maintaining a project is difficult, +sometimes unrewarding work. Sure, you will get to contribute cool +features to the project. But most of your time will be spent reviewing, +cleaning up, documenting, answering questions, justifying design +decisions - while everyone has all the fun! But remember - the quality +of the maintainers work is what distinguishes the good projects from the +great. So please be proud of your work, even the unglamourous parts, +and encourage a culture of appreciation and respect for *every* aspect +of improving the project - not just the hot new features. + +This document is a manual for maintainers old and new. It explains what +is expected of maintainers, how they should work, and what tools are +available to them. + +This is a living document - if you see something out of date or missing, +speak up! + +## What are a maintainer's responsibility? + +It is every maintainer's responsibility to: + +* 1) Expose a clear roadmap for improving their component. +* 2) Deliver prompt feedback and decisions on pull requests. +* 3) Be available to anyone with questions, bug reports, criticism etc. + on their component. This includes IRC, GitHub requests and the mailing + list. +* 4) Make sure their component respects the philosophy, design and + roadmap of the project. + +## How are decisions made? + +Short answer: with pull requests to the libcontainer repository. + +libcontainer is an open-source project with an open design philosophy. This +means that the repository is the source of truth for EVERY aspect of the +project, including its philosophy, design, roadmap and APIs. *If it's +part of the project, it's in the repo. It's in the repo, it's part of +the project.* + +As a result, all decisions can be expressed as changes to the +repository. An implementation change is a change to the source code. An +API change is a change to the API specification. A philosophy change is +a change to the philosophy manifesto. And so on. + +All decisions affecting libcontainer, big and small, follow the same 3 steps: + +* Step 1: Open a pull request. Anyone can do this. + +* Step 2: Discuss the pull request. Anyone can do this. + +* Step 3: Accept (`LGTM`) or refuse a pull request. The relevant maintainers do +this (see below "Who decides what?") + + +## Who decides what? + +All decisions are pull requests, and the relevant maintainers make +decisions by accepting or refusing the pull request. Review and acceptance +by anyone is denoted by adding a comment in the pull request: `LGTM`. +However, only currently listed `MAINTAINERS` are counted towards the required +two LGTMs. + +libcontainer follows the timeless, highly efficient and totally unfair system +known as [Benevolent dictator for life](http://en.wikipedia.org/wiki/Benevolent_Dictator_for_Life), with Michael Crosby in the role of BDFL. +This means that all decisions are made by default by Michael. Since making +every decision himself would be highly un-scalable, in practice decisions +are spread across multiple maintainers. + +The relevant maintainers for a pull request can be worked out in two steps: + +* Step 1: Determine the subdirectories affected by the pull request. This + might be `netlink/` and `security/`, or any other part of the repo. + +* Step 2: Find the `MAINTAINERS` file which affects this directory. If the + directory itself does not have a `MAINTAINERS` file, work your way up + the repo hierarchy until you find one. + +### I'm a maintainer, and I'm going on holiday + +Please let your co-maintainers and other contributors know by raising a pull +request that comments out your `MAINTAINERS` file entry using a `#`. + +### I'm a maintainer, should I make pull requests too? + +Yes. Nobody should ever push to master directly. All changes should be +made through a pull request. + +### Who assigns maintainers? + +Michael has final `LGTM` approval for all pull requests to `MAINTAINERS` files. + +### How is this process changed? + +Just like everything else: by making a pull request :) diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..0ec995fc --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ + +all: + docker build -t docker/libcontainer . + +test: + # we need NET_ADMIN for the netlink tests and SYS_ADMIN for mounting + docker run --rm -it --privileged docker/libcontainer + +sh: + docker run --rm -it --privileged -w /busybox docker/libcontainer nsinit exec sh + +GO_PACKAGES = $(shell find . -not \( -wholename ./vendor -prune -o -wholename ./.git -prune \) -name '*.go' -print0 | xargs -0n1 dirname | sort -u) + +direct-test: + go test -cover -v $(GO_PACKAGES) + +direct-test-short: + go test -cover -test.short -v $(GO_PACKAGES) + +direct-build: + go build -v $(GO_PACKAGES) + +direct-install: + go install -v $(GO_PACKAGES) diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..ca1635f8 --- /dev/null +++ b/NOTICE @@ -0,0 +1,16 @@ +libcontainer +Copyright 2012-2014 Docker, Inc. + +This product includes software developed at Docker, Inc. (http://www.docker.com). + +The following is courtesy of our legal counsel: + + +Use and transfer of Docker may be subject to certain restrictions by the +United States and other governments. +It is your responsibility to ensure that your use and/or transfer does not +violate applicable laws. + +For more information, please see http://www.bis.doc.gov + +See also http://www.apache.org/dev/crypto.html and/or seek legal counsel. diff --git a/PRINCIPLES.md b/PRINCIPLES.md new file mode 100644 index 00000000..42396c0e --- /dev/null +++ b/PRINCIPLES.md @@ -0,0 +1,19 @@ +# libcontainer Principles + +In the design and development of libcontainer we try to follow these principles: + +(Work in progress) + +* Don't try to replace every tool. Instead, be an ingredient to improve them. +* Less code is better. +* Fewer components are better. Do you really need to add one more class? +* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand. +* Don't do later what you can do now. "//FIXME: refactor" is not acceptable in new code. +* When hesitating between two options, choose the one that is easier to reverse. +* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later. +* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable. +* The fewer moving parts in a container, the better. +* Don't merge it unless you document it. +* Don't document it unless you can keep it up-to-date. +* Don't merge it unless you test it! +* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that. diff --git a/README.md b/README.md new file mode 100644 index 00000000..b80d2841 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +## libcontainer - reference implementation for containers [![Build Status](https://travis-ci.org/docker/libcontainer.png?branch=master)](https://travis-ci.org/docker/libcontainer) + +### Note on API changes: + +Please bear with us while we work on making the libcontainer API stable and something that we can support long term. We are currently discussing the API with the community, therefore, if you currently depend on libcontainer please pin your dependency at a specific tag or commit id. Please join the discussion and help shape the API. + +#### Background + +libcontainer specifies configuration options for what a container is. It provides a native Go implementation for using Linux namespaces with no external dependencies. libcontainer provides many convenience functions for working with namespaces, networking, and management. + + +#### Container +A container is a self contained execution environment that shares the kernel of the host system and which is (optionally) isolated from other containers in the system. + +libcontainer may be used to execute a process in a container. If a user tries to run a new process inside an existing container, the new process is added to the processes executing in the container. + + +#### Root file system + +A container runs with a directory known as its *root file system*, or *rootfs*, mounted as the file system root. The rootfs is usually a full system tree. + + +#### Configuration + +A container is initially configured by supplying configuration data when the container is created. + + +#### nsinit + +`nsinit` is a cli application which demonstrates the use of libcontainer. It is able to spawn new containers or join existing containers, based on the current directory. + +To use `nsinit`, cd into a Linux rootfs and copy a `container.json` file into the directory with your specified configuration. Environment, networking, and different capabilities for the container are specified in this file. The configuration is used for each process executed inside the container. + +See the `sample_configs` folder for examples of what the container configuration should look like. + +To execute `/bin/bash` in the current directory as a container just run the following **as root**: +```bash +nsinit exec /bin/bash +``` + +If you wish to spawn another process inside the container while your current bash session is running, run the same command again to get another bash shell (or change the command). If the original process (PID 1) dies, all other processes spawned inside the container will be killed and the namespace will be removed. + +You can identify if a process is running in a container by looking to see if `state.json` is in the root of the directory. + +You may also specify an alternate root place where the `container.json` file is read and where the `state.json` file will be saved. + +#### Future +See the [roadmap](ROADMAP.md). + +## Copyright and license + +Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license. +Docs released under Creative commons. + +## Hacking on libcontainer + +First of all, please familiarise yourself with the [libcontainer Principles](PRINCIPLES.md). + +If you're a *contributor* or aspiring contributor, you should read the [Contributors' Guide](CONTRIBUTORS_GUIDE.md). + +If you're a *maintainer* or aspiring maintainer, you should read the [Maintainers' Guide](MAINTAINERS_GUIDE.md) and +"How can I become a maintainer?" in the Contributors' Guide. diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 00000000..08deb9ad --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,16 @@ +# libcontainer: what's next? + +This document is a high-level overview of where we want to take libcontainer next. +It is a curated selection of planned improvements which are either important, difficult, or both. + +For a more complete view of planned and requested improvements, see [the Github issues](https://github.com/docker/libcontainer/issues). + +To suggest changes to the roadmap, including additions, please write the change as if it were already in effect, and make a pull request. + +## Broader kernel support + +Our goal is to make libcontainer run everywhere, but currently libcontainer requires Linux version 3.8 or higher. If you’re deploying new machines for the purpose of running libcontainer, this is a fairly easy requirement to meet. However, if you’re adding libcontainer to an existing deployment, you may not have the flexibility to update and patch the kernel. + +## Cross-architecture support + +Our goal is to make libcontainer run everywhere. However currently libcontainer only runs on x86_64 systems. We plan on expanding architecture support, so that libcontainer containers can be created and used on more architectures. diff --git a/api_temp.go b/api_temp.go new file mode 100644 index 00000000..9b2c5207 --- /dev/null +++ b/api_temp.go @@ -0,0 +1,34 @@ +/* +Temporary API endpoint for libcontainer while the full API is finalized (api.go). +*/ +package libcontainer + +import ( + "github.com/docker/libcontainer/cgroups/fs" + "github.com/docker/libcontainer/cgroups/systemd" + "github.com/docker/libcontainer/network" +) + +// TODO(vmarmol): Complete Stats() in final libcontainer API and move users to that. +// DEPRECATED: The below portions are only to be used during the transition to the official API. +// Returns all available stats for the given container. +func GetStats(container *Config, state *State) (*ContainerStats, error) { + var ( + err error + stats = &ContainerStats{} + ) + + if systemd.UseSystemd() { + stats.CgroupStats, err = systemd.GetStats(container.Cgroups) + } else { + stats.CgroupStats, err = fs.GetStats(container.Cgroups) + } + + if err != nil { + return stats, err + } + + stats.NetworkStats, err = network.GetStats(&state.NetworkState) + + return stats, err +} diff --git a/apparmor/apparmor.go b/apparmor/apparmor.go new file mode 100644 index 00000000..fb1574df --- /dev/null +++ b/apparmor/apparmor.go @@ -0,0 +1,35 @@ +// +build apparmor,linux + +package apparmor + +// #cgo LDFLAGS: -lapparmor +// #include +// #include +import "C" +import ( + "io/ioutil" + "os" + "unsafe" +) + +func IsEnabled() bool { + if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + return err == nil && len(buf) > 1 && buf[0] == 'Y' + } + return false +} + +func ApplyProfile(name string) error { + if name == "" { + return nil + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + if _, err := C.aa_change_onexec(cName); err != nil { + return err + } + return nil +} diff --git a/apparmor/apparmor_disabled.go b/apparmor/apparmor_disabled.go new file mode 100644 index 00000000..937bf915 --- /dev/null +++ b/apparmor/apparmor_disabled.go @@ -0,0 +1,11 @@ +// +build !apparmor !linux + +package apparmor + +func IsEnabled() bool { + return false +} + +func ApplyProfile(name string) error { + return nil +} diff --git a/apparmor/gen.go b/apparmor/gen.go new file mode 100644 index 00000000..825e646d --- /dev/null +++ b/apparmor/gen.go @@ -0,0 +1,94 @@ +package apparmor + +import ( + "io" + "os" + "text/template" +) + +type data struct { + Name string + Imports []string + InnerImports []string +} + +const baseTemplate = ` +{{range $value := .Imports}} +{{$value}} +{{end}} + +profile {{.Name}} flags=(attach_disconnected,mediate_deleted) { +{{range $value := .InnerImports}} + {{$value}} +{{end}} + + network, + capability, + file, + umount, + + mount fstype=tmpfs, + mount fstype=mqueue, + mount fstype=fuse.*, + mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/, + mount fstype=efivarfs -> /sys/firmware/efi/efivars/, + mount fstype=fusectl -> /sys/fs/fuse/connections/, + mount fstype=securityfs -> /sys/kernel/security/, + mount fstype=debugfs -> /sys/kernel/debug/, + mount fstype=proc -> /proc/, + mount fstype=sysfs -> /sys/, + + deny @{PROC}/sys/fs/** wklx, + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + deny mount options=(ro, remount) -> /, + deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/, + deny mount fstype=devpts, + + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, +} +` + +func generateProfile(out io.Writer) error { + compiled, err := template.New("apparmor_profile").Parse(baseTemplate) + if err != nil { + return err + } + data := &data{ + Name: "docker-default", + } + if tuntablesExists() { + data.Imports = append(data.Imports, "#include ") + } else { + data.Imports = append(data.Imports, "@{PROC}=/proc/") + } + if abstrctionsEsists() { + data.InnerImports = append(data.InnerImports, "#include ") + } + if err := compiled.Execute(out, data); err != nil { + return err + } + return nil +} + +// check if the tunables/global exist +func tuntablesExists() bool { + _, err := os.Stat("/etc/apparmor.d/tunables/global") + return err == nil +} + +// check if abstractions/base exist +func abstrctionsEsists() bool { + _, err := os.Stat("/etc/apparmor.d/abstractions/base") + return err == nil +} diff --git a/apparmor/setup.go b/apparmor/setup.go new file mode 100644 index 00000000..8ed54374 --- /dev/null +++ b/apparmor/setup.go @@ -0,0 +1,44 @@ +package apparmor + +import ( + "fmt" + "os" + "os/exec" + "path" +) + +const ( + DefaultProfilePath = "/etc/apparmor.d/docker" +) + +func InstallDefaultProfile() error { + if !IsEnabled() { + return nil + } + + // Make sure /etc/apparmor.d exists + if err := os.MkdirAll(path.Dir(DefaultProfilePath), 0755); err != nil { + return err + } + + f, err := os.OpenFile(DefaultProfilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return err + } + if err := generateProfile(f); err != nil { + f.Close() + return err + } + f.Close() + + cmd := exec.Command("/sbin/apparmor_parser", "-r", "-W", "docker") + // to use the parser directly we have to make sure we are in the correct + // dir with the profile + cmd.Dir = "/etc/apparmor.d" + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("Error loading docker apparmor profile: %s (%s)", err, output) + } + return nil +} diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go new file mode 100644 index 00000000..567e9a6c --- /dev/null +++ b/cgroups/cgroups.go @@ -0,0 +1,60 @@ +package cgroups + +import ( + "fmt" + + "github.com/docker/libcontainer/devices" +) + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + + _, ok := err.(*NotFoundError) + return ok +} + +type Cgroup struct { + Name string `json:"name,omitempty"` + Parent string `json:"parent,omitempty"` // name of parent cgroup or slice + + AllowAllDevices bool `json:"allow_all_devices,omitempty"` // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. + AllowedDevices []*devices.Device `json:"allowed_devices,omitempty"` + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemoryReservation int64 `json:"memory_reservation,omitempty"` // Memory reservation or soft_limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) + CpuQuota int64 `json:"cpu_quota,omitempty"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuPeriod int64 `json:"cpu_period,omitempty"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use + Freezer FreezerState `json:"freezer,omitempty"` // set the freeze value for the process + Slice string `json:"slice,omitempty"` // Parent slice to use for systemd +} + +type ActiveCgroup interface { + Cleanup() error + Paths() (map[string]string, error) +} diff --git a/cgroups/cgroups_test.go b/cgroups/cgroups_test.go new file mode 100644 index 00000000..e8c52938 --- /dev/null +++ b/cgroups/cgroups_test.go @@ -0,0 +1,27 @@ +package cgroups + +import ( + "bytes" + "testing" +) + +const ( + cgroupsContents = `11:hugetlb:/ +10:perf_event:/ +9:blkio:/ +8:net_cls:/ +7:freezer:/ +6:devices:/ +5:memory:/ +4:cpuacct,cpu:/ +3:cpuset:/ +2:name=systemd:/user.slice/user-1000.slice/session-16.scope` +) + +func TestParseCgroups(t *testing.T) { + r := bytes.NewBuffer([]byte(cgroupsContents)) + _, err := ParseCgroupFile("blkio", r) + if err != nil { + t.Fatal(err) + } +} diff --git a/cgroups/cgutil/cgutil.go b/cgroups/cgutil/cgutil.go new file mode 100644 index 00000000..d1a66117 --- /dev/null +++ b/cgroups/cgutil/cgutil.go @@ -0,0 +1,264 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + "os" + "syscall" + "time" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/cgroups/fs" + "github.com/docker/libcontainer/cgroups/systemd" +) + +var createCommand = cli.Command{ + Name: "create", + Usage: "Create a cgroup container using the supplied configuration and initial process.", + Flags: []cli.Flag{ + cli.StringFlag{Name: "config, c", Value: "cgroup.json", Usage: "path to container configuration (cgroups.Cgroup object)"}, + cli.IntFlag{Name: "pid, p", Value: 0, Usage: "pid of the initial process in the container"}, + }, + Action: createAction, +} + +var destroyCommand = cli.Command{ + Name: "destroy", + Usage: "Destroy an existing cgroup container.", + Flags: []cli.Flag{ + cli.StringFlag{Name: "name, n", Value: "", Usage: "container name"}, + cli.StringFlag{Name: "parent, p", Value: "", Usage: "container parent"}, + }, + Action: destroyAction, +} + +var statsCommand = cli.Command{ + Name: "stats", + Usage: "Get stats for cgroup", + Flags: []cli.Flag{ + cli.StringFlag{Name: "name, n", Value: "", Usage: "container name"}, + cli.StringFlag{Name: "parent, p", Value: "", Usage: "container parent"}, + }, + Action: statsAction, +} + +var pauseCommand = cli.Command{ + Name: "pause", + Usage: "Pause cgroup", + Flags: []cli.Flag{ + cli.StringFlag{Name: "name, n", Value: "", Usage: "container name"}, + cli.StringFlag{Name: "parent, p", Value: "", Usage: "container parent"}, + }, + Action: pauseAction, +} + +var resumeCommand = cli.Command{ + Name: "resume", + Usage: "Resume a paused cgroup", + Flags: []cli.Flag{ + cli.StringFlag{Name: "name, n", Value: "", Usage: "container name"}, + cli.StringFlag{Name: "parent, p", Value: "", Usage: "container parent"}, + }, + Action: resumeAction, +} + +var psCommand = cli.Command{ + Name: "ps", + Usage: "Get list of pids for a cgroup", + Flags: []cli.Flag{ + cli.StringFlag{Name: "name, n", Value: "", Usage: "container name"}, + cli.StringFlag{Name: "parent, p", Value: "", Usage: "container parent"}, + }, + Action: psAction, +} + +func getConfigFromFile(c *cli.Context) (*cgroups.Cgroup, error) { + f, err := os.Open(c.String("config")) + if err != nil { + return nil, err + } + defer f.Close() + + var config *cgroups.Cgroup + if err := json.NewDecoder(f).Decode(&config); err != nil { + log.Fatal(err) + } + return config, nil +} + +func openLog(name string) error { + f, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0755) + if err != nil { + return err + } + + log.SetOutput(f) + return nil +} + +func getConfig(context *cli.Context) (*cgroups.Cgroup, error) { + name := context.String("name") + if name == "" { + log.Fatal(fmt.Errorf("Missing container name")) + } + parent := context.String("parent") + return &cgroups.Cgroup{ + Name: name, + Parent: parent, + }, nil +} + +func killAll(config *cgroups.Cgroup) { + // We could use freezer here to prevent process spawning while we are trying + // to kill everything. But going with more portable solution of retrying for + // now. + pids := getPids(config) + retry := 10 + for len(pids) != 0 || retry > 0 { + killPids(pids) + time.Sleep(100 * time.Millisecond) + retry-- + pids = getPids(config) + } + if len(pids) != 0 { + log.Fatal(fmt.Errorf("Could not kill existing processes in the container.")) + } +} + +func getPids(config *cgroups.Cgroup) []int { + pids, err := fs.GetPids(config) + if err != nil { + log.Fatal(err) + } + return pids +} + +func killPids(pids []int) { + for _, pid := range pids { + // pids might go away on their own. Ignore errors. + syscall.Kill(pid, syscall.SIGKILL) + } +} + +func setFreezerState(context *cli.Context, state cgroups.FreezerState) { + config, err := getConfig(context) + if err != nil { + log.Fatal(err) + } + + if systemd.UseSystemd() { + err = systemd.Freeze(config, state) + } else { + err = fs.Freeze(config, state) + } + if err != nil { + log.Fatal(err) + } +} + +func createAction(context *cli.Context) { + config, err := getConfigFromFile(context) + if err != nil { + log.Fatal(err) + } + pid := context.Int("pid") + if pid <= 0 { + log.Fatal(fmt.Errorf("Invalid pid : %d", pid)) + } + if systemd.UseSystemd() { + _, err := systemd.Apply(config, pid) + if err != nil { + log.Fatal(err) + } + } else { + _, err := fs.Apply(config, pid) + if err != nil { + log.Fatal(err) + } + } +} + +func destroyAction(context *cli.Context) { + config, err := getConfig(context) + if err != nil { + log.Fatal(err) + } + + killAll(config) + // Systemd will clean up cgroup state for empty container. + if !systemd.UseSystemd() { + err := fs.Cleanup(config) + if err != nil { + log.Fatal(err) + } + } +} + +func statsAction(context *cli.Context) { + config, err := getConfig(context) + if err != nil { + log.Fatal(err) + } + stats, err := fs.GetStats(config) + if err != nil { + log.Fatal(err) + } + + out, err := json.MarshalIndent(stats, "", "\t") + if err != nil { + log.Fatal(err) + } + fmt.Printf("Usage stats for '%s':\n %v\n", config.Name, string(out)) +} + +func pauseAction(context *cli.Context) { + setFreezerState(context, cgroups.Frozen) +} + +func resumeAction(context *cli.Context) { + setFreezerState(context, cgroups.Thawed) +} + +func psAction(context *cli.Context) { + config, err := getConfig(context) + if err != nil { + log.Fatal(err) + } + + pids, err := fs.GetPids(config) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Pids in '%s':\n", config.Name) + fmt.Println(pids) +} + +func main() { + logPath := os.Getenv("log") + if logPath != "" { + if err := openLog(logPath); err != nil { + log.Fatal(err) + } + } + + app := cli.NewApp() + app.Name = "cgutil" + app.Usage = "Test utility for libcontainer cgroups package" + app.Version = "0.1" + + app.Commands = []cli.Command{ + createCommand, + destroyCommand, + statsCommand, + pauseCommand, + resumeCommand, + psCommand, + } + + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } +} diff --git a/cgroups/cgutil/sample_cgroup.json b/cgroups/cgutil/sample_cgroup.json new file mode 100644 index 00000000..2d297849 --- /dev/null +++ b/cgroups/cgutil/sample_cgroup.json @@ -0,0 +1,10 @@ +{ + "name": "luke", + "parent": "darth", + "allow_all_devices": true, + "memory": 1073741824, + "memory_swap": -1, + "cpu_shares": 2048, + "cpu_quota": 500000, + "cpu_period": 250000 +} diff --git a/cgroups/fs/apply_raw.go b/cgroups/fs/apply_raw.go new file mode 100644 index 00000000..133241e4 --- /dev/null +++ b/cgroups/fs/apply_raw.go @@ -0,0 +1,248 @@ +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + + "github.com/docker/libcontainer/cgroups" +) + +var ( + subsystems = map[string]subsystem{ + "devices": &DevicesGroup{}, + "memory": &MemoryGroup{}, + "cpu": &CpuGroup{}, + "cpuset": &CpusetGroup{}, + "cpuacct": &CpuacctGroup{}, + "blkio": &BlkioGroup{}, + "perf_event": &PerfEventGroup{}, + "freezer": &FreezerGroup{}, + } + CgroupProcesses = "cgroup.procs" +) + +// The absolute path to the root of the cgroup hierarchies. +var cgroupRoot string + +// TODO(vmarmol): Report error here, we'll probably need to wait for the new API. +func init() { + // we can pick any subsystem to find the root + cpuRoot, err := cgroups.FindCgroupMountpoint("cpu") + if err != nil { + return + } + cgroupRoot = filepath.Dir(cpuRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return + } +} + +type subsystem interface { + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Removes the cgroup represented by 'data'. + Remove(*data) error + // Creates and joins the cgroup represented by data. + Set(*data) error +} + +type data struct { + root string + cgroup string + c *cgroups.Cgroup + pid int +} + +func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { + d, err := getCgroupData(c, pid) + if err != nil { + return nil, err + } + + for _, sys := range subsystems { + if err := sys.Set(d); err != nil { + d.Cleanup() + return nil, err + } + } + + return d, nil +} + +func Cleanup(c *cgroups.Cgroup) error { + d, err := getCgroupData(c, 0) + if err != nil { + return fmt.Errorf("Could not get Cgroup data %s", err) + } + return d.Cleanup() +} + +func GetStats(c *cgroups.Cgroup) (*cgroups.Stats, error) { + stats := cgroups.NewStats() + + d, err := getCgroupData(c, 0) + if err != nil { + return nil, fmt.Errorf("getting CgroupData %s", err) + } + + for sysname, sys := range subsystems { + path, err := d.path(sysname) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + + return nil, err + } + + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func Freeze(c *cgroups.Cgroup, state cgroups.FreezerState) error { + d, err := getCgroupData(c, 0) + if err != nil { + return err + } + + c.Freezer = state + + freezer := subsystems["freezer"] + + return freezer.Set(d) +} + +func GetPids(c *cgroups.Cgroup) ([]int, error) { + d, err := getCgroupData(c, 0) + if err != nil { + return nil, err + } + + dir, err := d.path("devices") + if err != nil { + return nil, err + } + + return cgroups.ReadProcsFile(dir) +} + +func getCgroupData(c *cgroups.Cgroup, pid int) (*data, error) { + if cgroupRoot == "" { + return nil, fmt.Errorf("failed to find the cgroup root") + } + + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + + return &data{ + root: cgroupRoot, + cgroup: cgroup, + c: c, + pid: pid, + }, nil +} + +func (raw *data) parent(subsystem string) (string, error) { + initPath, err := cgroups.GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(raw.root, subsystem, initPath), nil +} + +func (raw *data) Paths() (map[string]string, error) { + paths := make(map[string]string) + + for sysname := range subsystems { + path, err := raw.path(sysname) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + + return nil, err + } + + paths[sysname] = path + } + + return paths, nil +} + +func (raw *data) path(subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(raw.cgroup) { + path := filepath.Join(raw.root, subsystem, raw.cgroup) + + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + return "", cgroups.NewNotFoundError(subsystem) + } + + return "", err + } + + return path, nil + } + + parent, err := raw.parent(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parent, raw.cgroup), nil +} + +func (raw *data) join(subsystem string) (string, error) { + path, err := raw.path(subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil { + return "", err + } + return path, nil +} + +func (raw *data) Cleanup() error { + for _, sys := range subsystems { + sys.Remove(raw) + } + return nil +} + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func readFile(dir, file string) (string, error) { + data, err := ioutil.ReadFile(filepath.Join(dir, file)) + return string(data), err +} + +func removePath(p string, err error) error { + if err != nil { + return err + } + if p != "" { + return os.RemoveAll(p) + } + return nil +} diff --git a/cgroups/fs/blkio.go b/cgroups/fs/blkio.go new file mode 100644 index 00000000..261a97ff --- /dev/null +++ b/cgroups/fs/blkio.go @@ -0,0 +1,167 @@ +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/docker/libcontainer/cgroups" +) + +type BlkioGroup struct { +} + +func (s *BlkioGroup) Set(d *data) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("blkio"); err != nil && !cgroups.IsNotFound(err) { + return err + } + + return nil +} + +func (s *BlkioGroup) Remove(d *data) error { + return removePath(d.path("blkio")) +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, err + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, err + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, err + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + // Try to read CFQ stats available on all CFQ enabled kernels first + if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil { + return getCFQStats(path, stats) + } + return getStats(path, stats) // Use generic stats as fallback +} + +func getCFQStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil { + return err + } + stats.BlkioStats.SectorsRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil { + return err + } + stats.BlkioStats.IoQueuedRecursive = blkioStats + + return nil +} + +func getStats(path string, stats *cgroups.Stats) error { + var blkioStats []cgroups.BlkioStatEntry + var err error + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil { + return err + } + stats.BlkioStats.IoServiceBytesRecursive = blkioStats + + if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil { + return err + } + stats.BlkioStats.IoServicedRecursive = blkioStats + + return nil +} diff --git a/cgroups/fs/blkio_test.go b/cgroups/fs/blkio_test.go new file mode 100644 index 00000000..2a79d260 --- /dev/null +++ b/cgroups/fs/blkio_test.go @@ -0,0 +1,248 @@ +package fs + +import ( + "testing" + + "github.com/docker/libcontainer/cgroups" +) + +const ( + sectorsRecursiveContents = `8:0 1024` + serviceBytesRecursiveContents = `8:0 Read 100 +8:0 Write 200 +8:0 Sync 300 +8:0 Async 500 +8:0 Total 500 +Total 500` + servicedRecursiveContents = `8:0 Read 10 +8:0 Write 40 +8:0 Sync 20 +8:0 Async 30 +8:0 Total 50 +Total 50` + queuedRecursiveContents = `8:0 Read 1 +8:0 Write 4 +8:0 Sync 2 +8:0 Async 3 +8:0 Total 5 +Total 5` + throttleServiceBytes = `8:0 Read 11030528 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 11030528 +8:0 Total 11030528 +252:0 Read 11030528 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 11030528 +252:0 Total 11030528 +Total 22061056` + throttleServiced = `8:0 Read 164 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 164 +8:0 Total 164 +252:0 Read 164 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 164 +252:0 Total 164 +Total 328` +) + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { + *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) +} + +func TestBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoSectorsFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoServiceBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoServicedFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsNoQueuedFile(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatalf("Failed unexpectedly: %s", err) + } +} + +func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read 100 100", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestBlkioStatsUnexpectedFieldType(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read Write", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestNonCFQBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.throttle.io_service_bytes": throttleServiceBytes, + "blkio.throttle.io_serviced": throttleServiced, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} diff --git a/cgroups/fs/cpu.go b/cgroups/fs/cpu.go new file mode 100644 index 00000000..efac9ed1 --- /dev/null +++ b/cgroups/fs/cpu.go @@ -0,0 +1,72 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/docker/libcontainer/cgroups" +) + +type CpuGroup struct { +} + +func (s *CpuGroup) Set(d *data) error { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := d.join("cpu") + if err != nil { + return err + } + if d.c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(d.c.CpuShares, 10)); err != nil { + return err + } + } + if d.c.CpuPeriod != 0 { + if err := writeFile(dir, "cpu.cfs_period_us", strconv.FormatInt(d.c.CpuPeriod, 10)); err != nil { + return err + } + } + if d.c.CpuQuota != 0 { + if err := writeFile(dir, "cpu.cfs_quota_us", strconv.FormatInt(d.c.CpuQuota, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Remove(d *data) error { + return removePath(d.path("cpu")) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(path, "cpu.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/cgroups/fs/cpu_test.go b/cgroups/fs/cpu_test.go new file mode 100644 index 00000000..2470e689 --- /dev/null +++ b/cgroups/fs/cpu_test.go @@ -0,0 +1,69 @@ +package fs + +import ( + "fmt" + "testing" + + "github.com/docker/libcontainer/cgroups" +) + +func TestCpuStats(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + kNrPeriods = 2000 + kNrThrottled = 200 + kThrottledTime = uint64(18446744073709551615) + ) + + cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n", + kNrPeriods, kNrThrottled, kThrottledTime) + helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.ThrottlingData{ + Periods: kNrPeriods, + ThrottledPeriods: kNrThrottled, + ThrottledTime: kThrottledTime} + + expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) +} + +func TestNoCpuStatFile(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal("Expected not to fail, but did") + } +} + +func TestInvalidCpuStat(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + cpuStatContent := `nr_periods 2000 + nr_throttled 200 + throttled_time fortytwo` + helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failed stat parsing.") + } +} diff --git a/cgroups/fs/cpuacct.go b/cgroups/fs/cpuacct.go new file mode 100644 index 00000000..14b55ccd --- /dev/null +++ b/cgroups/fs/cpuacct.go @@ -0,0 +1,110 @@ +package fs + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/system" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + nanosecondsInSecond = 1000000000 +) + +var clockTicks = uint64(system.GetClockTicks()) + +type CpuacctGroup struct { +} + +func (s *CpuacctGroup) Set(d *data) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) { + return err + } + + return nil +} + +func (s *CpuacctGroup) Remove(d *data) error { + return removePath(d.path("cpuacct")) +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := getCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + userModeUsage := uint64(0) + kernelModeUsage := uint64(0) + const ( + userField = "user" + systemField = "system" + ) + + // Expected format: + // user + // system + data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat)) + if err != nil { + return 0, 0, err + } + fields := strings.Fields(string(data)) + if len(fields) != 4 { + return 0, 0, fmt.Errorf("failure - %s is expected to have 4 fields", filepath.Join(path, cgroupCpuacctStat)) + } + if fields[0] != userField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) + } + if fields[2] != systemField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, err + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, err + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + percpuUsage := []uint64{} + data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu")) + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(string(data)) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err) + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} diff --git a/cgroups/fs/cpuset.go b/cgroups/fs/cpuset.go new file mode 100644 index 00000000..88477394 --- /dev/null +++ b/cgroups/fs/cpuset.go @@ -0,0 +1,119 @@ +package fs + +import ( + "bytes" + "io/ioutil" + "os" + "path/filepath" + "strconv" + + "github.com/docker/libcontainer/cgroups" +) + +type CpusetGroup struct { +} + +func (s *CpusetGroup) Set(d *data) error { + // we don't want to join this cgroup unless it is specified + if d.c.CpusetCpus != "" { + dir, err := d.path("cpuset") + if err != nil { + return err + } + + return s.SetDir(dir, d.c.CpusetCpus, d.pid) + } + + return nil +} + +func (s *CpusetGroup) Remove(d *data) error { + return removePath(d.path("cpuset")) +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *CpusetGroup) SetDir(dir, value string, pid int) error { + if err := s.ensureParent(dir); err != nil { + return err + } + + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return err + } + + if err := writeFile(dir, "cpuset.cpus", value); err != nil { + return err + } + + return nil +} + +func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { + return + } + return cpus, mems, nil +} + +// ensureParent ensures that the parent directory of current is created +// with the proper cpus and mems files copied from it's parent if the values +// are a file with a new line char +func (s *CpusetGroup) ensureParent(current string) error { + parent := filepath.Dir(current) + + if _, err := os.Stat(parent); err != nil { + if !os.IsNotExist(err) { + return err + } + + if err := s.ensureParent(parent); err != nil { + return err + } + } + + if err := os.MkdirAll(current, 0755); err != nil && !os.IsExist(err) { + return err + } + return s.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (s *CpusetGroup) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + + if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { + return err + } + if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + return err + } + + if s.isEmpty(currentCpus) { + if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if s.isEmpty(currentMems) { + if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} diff --git a/cgroups/fs/devices.go b/cgroups/fs/devices.go new file mode 100644 index 00000000..98d5d2d7 --- /dev/null +++ b/cgroups/fs/devices.go @@ -0,0 +1,34 @@ +package fs + +import "github.com/docker/libcontainer/cgroups" + +type DevicesGroup struct { +} + +func (s *DevicesGroup) Set(d *data) error { + dir, err := d.join("devices") + if err != nil { + return err + } + + if !d.c.AllowAllDevices { + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + for _, dev := range d.c.AllowedDevices { + if err := writeFile(dir, "devices.allow", dev.GetCgroupAllowString()); err != nil { + return err + } + } + } + return nil +} + +func (s *DevicesGroup) Remove(d *data) error { + return removePath(d.path("devices")) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/cgroups/fs/freezer.go b/cgroups/fs/freezer.go new file mode 100644 index 00000000..c6b677fa --- /dev/null +++ b/cgroups/fs/freezer.go @@ -0,0 +1,50 @@ +package fs + +import ( + "strings" + "time" + + "github.com/docker/libcontainer/cgroups" +) + +type FreezerGroup struct { +} + +func (s *FreezerGroup) Set(d *data) error { + switch d.c.Freezer { + case cgroups.Frozen, cgroups.Thawed: + dir, err := d.path("freezer") + if err != nil { + return err + } + + if err := writeFile(dir, "freezer.state", string(d.c.Freezer)); err != nil { + return err + } + + for { + state, err := readFile(dir, "freezer.state") + if err != nil { + return err + } + if strings.TrimSpace(state) == string(d.c.Freezer) { + break + } + time.Sleep(1 * time.Millisecond) + } + default: + if _, err := d.join("freezer"); err != nil && !cgroups.IsNotFound(err) { + return err + } + } + + return nil +} + +func (s *FreezerGroup) Remove(d *data) error { + return removePath(d.path("freezer")) +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/cgroups/fs/memory.go b/cgroups/fs/memory.go new file mode 100644 index 00000000..3f9647c2 --- /dev/null +++ b/cgroups/fs/memory.go @@ -0,0 +1,93 @@ +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + + "github.com/docker/libcontainer/cgroups" +) + +type MemoryGroup struct { +} + +func (s *MemoryGroup) Set(d *data) error { + dir, err := d.join("memory") + // only return an error for memory if it was specified + if err != nil && (d.c.Memory != 0 || d.c.MemoryReservation != 0 || d.c.MemorySwap != 0) { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + // Only set values if some config was specified. + if d.c.Memory != 0 || d.c.MemoryReservation != 0 || d.c.MemorySwap != 0 { + if d.c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(d.c.Memory, 10)); err != nil { + return err + } + } + if d.c.MemoryReservation != 0 { + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(d.c.MemoryReservation, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if d.c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(d.c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (s *MemoryGroup) Remove(d *data) error { + return removePath(d.path("memory")) +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. + statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + + // Set memory usage and max historical usage. + value, err := getCgroupParamUint(path, "memory.usage_in_bytes") + if err != nil { + return fmt.Errorf("failed to parse memory.usage_in_bytes - %v", err) + } + stats.MemoryStats.Usage = value + value, err = getCgroupParamUint(path, "memory.max_usage_in_bytes") + if err != nil { + return fmt.Errorf("failed to parse memory.max_usage_in_bytes - %v", err) + } + stats.MemoryStats.MaxUsage = value + value, err = getCgroupParamUint(path, "memory.failcnt") + if err != nil { + return fmt.Errorf("failed to parse memory.failcnt - %v", err) + } + stats.MemoryStats.Failcnt = value + + return nil +} diff --git a/cgroups/fs/memory_test.go b/cgroups/fs/memory_test.go new file mode 100644 index 00000000..a21cec75 --- /dev/null +++ b/cgroups/fs/memory_test.go @@ -0,0 +1,134 @@ +package fs + +import ( + "testing" + + "github.com/docker/libcontainer/cgroups" +) + +const ( + memoryStatContents = `cache 512 +rss 1024` + memoryUsageContents = "2048\n" + memoryMaxUsageContents = "4096\n" + memoryFailcnt = "100\n" +) + +func TestMemoryStats(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.failcnt": memoryFailcnt, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.MemoryStats{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Stats: map[string]uint64{"cache": 512, "rss": 1024}} + expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) +} + +func TestMemoryStatsNoStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } +} + +func TestMemoryStatsNoUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": "rss rss", + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": "bad", + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": "bad", + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} diff --git a/cgroups/fs/notify_linux.go b/cgroups/fs/notify_linux.go new file mode 100644 index 00000000..d92063ba --- /dev/null +++ b/cgroups/fs/notify_linux.go @@ -0,0 +1,82 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/docker/libcontainer/cgroups" +) + +// NotifyOnOOM sends signals on the returned channel when the cgroup reaches +// its memory limit. The channel is closed when the cgroup is removed. +func NotifyOnOOM(c *cgroups.Cgroup) (<-chan struct{}, error) { + d, err := getCgroupData(c, 0) + if err != nil { + return nil, err + } + + return notifyOnOOM(d) +} + +func notifyOnOOM(d *data) (<-chan struct{}, error) { + dir, err := d.path("memory") + if err != nil { + return nil, err + } + + fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) + if syserr != 0 { + return nil, syserr + } + + eventfd := os.NewFile(fd, "eventfd") + + oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) + if err != nil { + eventfd.Close() + return nil, err + } + + var ( + eventControlPath = filepath.Join(dir, "cgroup.event_control") + data = fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) + ) + + if err := writeFile(dir, "cgroup.event_control", data); err != nil { + eventfd.Close() + oomControl.Close() + return nil, err + } + + ch := make(chan struct{}) + + go func() { + defer func() { + close(ch) + eventfd.Close() + oomControl.Close() + }() + + buf := make([]byte, 8) + + for { + if _, err := eventfd.Read(buf); err != nil { + return + } + + // When a cgroup is destroyed, an event is sent to eventfd. + // So if the control path is gone, return instead of notifying. + if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { + return + } + + ch <- struct{}{} + } + }() + + return ch, nil +} diff --git a/cgroups/fs/notify_linux_test.go b/cgroups/fs/notify_linux_test.go new file mode 100644 index 00000000..a11880cb --- /dev/null +++ b/cgroups/fs/notify_linux_test.go @@ -0,0 +1,86 @@ +// +build linux + +package fs + +import ( + "encoding/binary" + "fmt" + "syscall" + "testing" + "time" +) + +func TestNotifyOnOOM(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "memory.oom_control": "", + "cgroup.event_control": "", + }) + + var eventFd, oomControlFd int + + ooms, err := notifyOnOOM(helper.CgroupData) + if err != nil { + t.Fatal("expected no error, got:", err) + } + + memoryPath, _ := helper.CgroupData.path("memory") + data, err := readFile(memoryPath, "cgroup.event_control") + if err != nil { + t.Fatal("couldn't read event control file:", err) + } + + if _, err := fmt.Sscanf(data, "%d %d", &eventFd, &oomControlFd); err != nil { + t.Fatalf("invalid control data %q: %s", data, err) + } + + // re-open the eventfd + efd, err := syscall.Dup(eventFd) + if err != nil { + t.Fatal("unable to reopen eventfd:", err) + } + defer syscall.Close(efd) + + if err != nil { + t.Fatal("unable to dup event fd:", err) + } + + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, 1) + + if _, err := syscall.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + select { + case <-ooms: + case <-time.After(100 * time.Millisecond): + t.Fatal("no notification on oom channel after 100ms") + } + + // simulate what happens when a cgroup is destroyed by cleaning up and then + // writing to the eventfd. + helper.cleanup() + if _, err := syscall.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + // give things a moment to shut down + select { + case _, ok := <-ooms: + if ok { + t.Fatal("expected no oom to be triggered") + } + case <-time.After(100 * time.Millisecond): + } + + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, uintptr(oomControlFd), syscall.F_GETFD, 0); err != syscall.EBADF { + t.Error("expected oom control to be closed") + } + + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, uintptr(eventFd), syscall.F_GETFD, 0); err != syscall.EBADF { + t.Error("expected event fd to be closed") + } +} diff --git a/cgroups/fs/perf_event.go b/cgroups/fs/perf_event.go new file mode 100644 index 00000000..813274d8 --- /dev/null +++ b/cgroups/fs/perf_event.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/docker/libcontainer/cgroups" +) + +type PerfEventGroup struct { +} + +func (s *PerfEventGroup) Set(d *data) error { + // we just want to join this group even though we don't set anything + if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PerfEventGroup) Remove(d *data) error { + return removePath(d.path("perf_event")) +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/cgroups/fs/stats_util_test.go b/cgroups/fs/stats_util_test.go new file mode 100644 index 00000000..7e7da754 --- /dev/null +++ b/cgroups/fs/stats_util_test.go @@ -0,0 +1,73 @@ +package fs + +import ( + "fmt" + "log" + "testing" + + "github.com/docker/libcontainer/cgroups" +) + +func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { + if len(expected) != len(actual) { + return fmt.Errorf("blkioStatEntries length do not match") + } + for i, expValue := range expected { + actValue := actual[i] + if expValue != actValue { + return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue) + } + } + return nil +} + +func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { + if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { + log.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { + log.Printf("blkio IoServicedRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { + log.Printf("blkio IoQueuedRecursive do not match - %s\n", err) + t.Fail() + } + + if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { + log.Printf("blkio SectorsRecursive do not match - %s\n", err) + t.Fail() + } +} + +func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + if expected != actual { + log.Printf("Expected throttling data %v but found %v\n", expected, actual) + t.Fail() + } +} + +func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + if expected.Usage != actual.Usage { + log.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage) + t.Fail() + } + if expected.MaxUsage != actual.MaxUsage { + log.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage) + t.Fail() + } + for key, expValue := range expected.Stats { + actValue, ok := actual.Stats[key] + if !ok { + log.Printf("Expected memory stat key %s not found\n", key) + t.Fail() + } + if expValue != actValue { + log.Printf("Expected memory stat value %d but found %d\n", expValue, actValue) + t.Fail() + } + } +} diff --git a/cgroups/fs/util_test.go b/cgroups/fs/util_test.go new file mode 100644 index 00000000..548870a8 --- /dev/null +++ b/cgroups/fs/util_test.go @@ -0,0 +1,60 @@ +/* +Utility for testing cgroup operations. + +Creates a mock of the cgroup filesystem for the duration of the test. +*/ +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "testing" +) + +type cgroupTestUtil struct { + // data to use in tests. + CgroupData *data + + // Path to the mock cgroup directory. + CgroupPath string + + // Temporary directory to store mock cgroup filesystem. + tempDir string + t *testing.T +} + +// Creates a new test util for the specified subsystem +func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil { + d := &data{} + tempDir, err := ioutil.TempDir("", fmt.Sprintf("%s_cgroup_test", subsystem)) + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testCgroupPath, err := d.path(subsystem) + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock cgroup path exists. + err = os.MkdirAll(testCgroupPath, 0755) + if err != nil { + t.Fatal(err) + } + return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t} +} + +func (c *cgroupTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified cgroup files. +func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := writeFile(c.CgroupPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/cgroups/fs/utils.go b/cgroups/fs/utils.go new file mode 100644 index 00000000..f37a3a48 --- /dev/null +++ b/cgroups/fs/utils.go @@ -0,0 +1,62 @@ +package fs + +import ( + "errors" + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" +) + +var ( + ErrNotSupportStat = errors.New("stats are not supported for subsystem") + ErrNotValidFormat = errors.New("line is not a valid key value format") +) + +// Saturates negative values at zero and returns a uint64. +// Due to kernel bugs, some of the memory cgroup stats can be negative. +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Parses a cgroup param and returns as name, value +// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234 +func getCgroupParamKeyValue(t string) (string, uint64, error) { + parts := strings.Fields(t) + switch len(parts) { + case 2: + value, err := parseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("Unable to convert param value (%q) to uint64: %v", parts[1], err) + } + + return parts[0], value, nil + default: + return "", 0, ErrNotValidFormat + } +} + +// Gets a single uint64 value from the specified cgroup file. +func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { + contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile)) + if err != nil { + return 0, err + } + + return parseUint(strings.TrimSpace(string(contents)), 10, 64) +} diff --git a/cgroups/fs/utils_test.go b/cgroups/fs/utils_test.go new file mode 100644 index 00000000..f1afd494 --- /dev/null +++ b/cgroups/fs/utils_test.go @@ -0,0 +1,95 @@ +package fs + +import ( + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "testing" +) + +const ( + cgroupFile = "cgroup.file" + floatValue = 2048.0 + floatString = "2048" +) + +func TestGetCgroupParamsInt(t *testing.T) { + // Setup tempdir. + tempDir, err := ioutil.TempDir("", "cgroup_utils_test") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tempDir) + tempFile := filepath.Join(tempDir, cgroupFile) + + // Success. + err = ioutil.WriteFile(tempFile, []byte(floatString), 0755) + if err != nil { + t.Fatal(err) + } + value, err := getCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with new line. + err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = getCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with negative values + err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = getCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %f", value, 0) + } + + // Success with negative values lesser than min int64 + s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) + err = ioutil.WriteFile(tempFile, []byte(s), 0755) + if err != nil { + t.Fatal(err) + } + value, err = getCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %f", value, 0) + } + + // Not a float. + err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755) + if err != nil { + t.Fatal(err) + } + _, err = getCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } + + // Unknown file. + err = os.Remove(tempFile) + if err != nil { + t.Fatal(err) + } + _, err = getCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } +} diff --git a/cgroups/stats.go b/cgroups/stats.go new file mode 100644 index 00000000..f5225139 --- /dev/null +++ b/cgroups/stats.go @@ -0,0 +1,69 @@ +package cgroups + +type ThrottlingData struct { + // Number of periods with throttling active + Periods uint64 `json:"periods,omitempty"` + // Number of periods when the container hit its throttling limit. + ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` + // Aggregate time the container was throttled for in nanoseconds. + ThrottledTime uint64 `json:"throttled_time,omitempty"` +} + +// All CPU stats are aggregate since container inception. +type CpuUsage struct { + // Total CPU time consumed. + // Units: nanoseconds. + TotalUsage uint64 `json:"total_usage,omitempty"` + // Total CPU time consumed per core. + // Units: nanoseconds. + PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throlling_data,omitempty"` +} + +type MemoryStats struct { + // current res_counter usage for memory + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // TODO(vishh): Export these as stronger types. + // all the stats exported via memory.stat. + Stats map[string]uint64 `json:"stats,omitempty"` + // number of times memory usage hits limits. + Failcnt uint64 `json:"failcnt"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes tranferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recusrive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + return &Stats{MemoryStats: memoryStats} +} diff --git a/cgroups/systemd/apply_nosystemd.go b/cgroups/systemd/apply_nosystemd.go new file mode 100644 index 00000000..68559109 --- /dev/null +++ b/cgroups/systemd/apply_nosystemd.go @@ -0,0 +1,29 @@ +// +build !linux + +package systemd + +import ( + "fmt" + + "github.com/docker/libcontainer/cgroups" +) + +func UseSystemd() bool { + return false +} + +func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func GetPids(c *cgroups.Cgroup) ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + +func Freeze(c *cgroups.Cgroup, state cgroups.FreezerState) error { + return fmt.Errorf("Systemd not supported") +} + +func GetStats(c *cgroups.Cgroup) (*cgroups.Stats, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/cgroups/systemd/apply_systemd.go b/cgroups/systemd/apply_systemd.go new file mode 100644 index 00000000..7af4818e --- /dev/null +++ b/cgroups/systemd/apply_systemd.go @@ -0,0 +1,358 @@ +// +build linux + +package systemd + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + systemd "github.com/coreos/go-systemd/dbus" + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/cgroups/fs" + "github.com/godbus/dbus" +) + +type systemdCgroup struct { + cgroup *cgroups.Cgroup +} + +type subsystem interface { + GetStats(string, *cgroups.Stats) error +} + +var ( + connLock sync.Mutex + theConn *systemd.Conn + hasStartTransientUnit bool + subsystems = map[string]subsystem{ + "devices": &fs.DevicesGroup{}, + "memory": &fs.MemoryGroup{}, + "cpu": &fs.CpuGroup{}, + "cpuset": &fs.CpusetGroup{}, + "cpuacct": &fs.CpuacctGroup{}, + "blkio": &fs.BlkioGroup{}, + "perf_event": &fs.PerfEventGroup{}, + "freezer": &fs.FreezerGroup{}, + } +) + +func UseSystemd() bool { + s, err := os.Stat("/run/systemd/system") + if err != nil || !s.IsDir() { + return false + } + + connLock.Lock() + defer connLock.Unlock() + + if theConn == nil { + var err error + theConn, err = systemd.New() + if err != nil { + return false + } + + // Assume we have StartTransientUnit + hasStartTransientUnit = true + + // But if we get UnknownMethod error we don't + if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { + hasStartTransientUnit = false + } + } + } + } + return hasStartTransientUnit +} + +func getIfaceForUnit(unitName string) string { + if strings.HasSuffix(unitName, ".scope") { + return "Scope" + } + if strings.HasSuffix(unitName, ".service") { + return "Service" + } + return "Unit" +} + +func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) { + var ( + unitName = getUnitName(c) + slice = "system.slice" + properties []systemd.Property + res = &systemdCgroup{} + ) + + res.cgroup = c + + if c.Slice != "" { + slice = c.Slice + } + + properties = append(properties, + systemd.Property{"Slice", dbus.MakeVariant(slice)}, + systemd.Property{"Description", dbus.MakeVariant("docker container " + c.Name)}, + systemd.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})}, + ) + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + systemd.Property{"MemoryAccounting", dbus.MakeVariant(true)}, + systemd.Property{"CPUAccounting", dbus.MakeVariant(true)}, + systemd.Property{"BlockIOAccounting", dbus.MakeVariant(true)}) + + if c.Memory != 0 { + properties = append(properties, + systemd.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))}) + } + // TODO: MemoryReservation and MemorySwap not available in systemd + + if c.CpuShares != 0 { + properties = append(properties, + systemd.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))}) + } + + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { + return nil, err + } + + if !c.AllowAllDevices { + if err := joinDevices(c, pid); err != nil { + return nil, err + } + } + + // -1 disables memorySwap + if c.MemorySwap >= 0 && (c.Memory != 0 || c.MemorySwap > 0) { + if err := joinMemory(c, pid); err != nil { + return nil, err + } + + } + + // we need to manually join the freezer cgroup in systemd because it does not currently support it + // via the dbus api + if err := joinFreezer(c, pid); err != nil { + return nil, err + } + + if c.CpusetCpus != "" { + if err := joinCpuset(c, pid); err != nil { + return nil, err + } + } + + return res, nil +} + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func (c *systemdCgroup) Paths() (map[string]string, error) { + paths := make(map[string]string) + + for sysname := range subsystems { + subsystemPath, err := getSubsystemPath(c.cgroup, sysname) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + + return nil, err + } + + paths[sysname] = subsystemPath + } + + return paths, nil +} + +func (c *systemdCgroup) Cleanup() error { + // systemd cleans up, we don't need to do much + paths, err := c.Paths() + if err != nil { + return err + } + + for _, path := range paths { + os.RemoveAll(path) + } + + return nil +} + +func joinFreezer(c *cgroups.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "freezer") + if err != nil { + return err + } + + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return err + } + + return ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700) +} + +func getSubsystemPath(c *cgroups.Cgroup, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return "", err + } + + initPath, err := cgroups.GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + + slice := "system.slice" + if c.Slice != "" { + slice = c.Slice + } + + return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil +} + +func Freeze(c *cgroups.Cgroup, state cgroups.FreezerState) error { + path, err := getSubsystemPath(c, "freezer") + if err != nil { + return err + } + + if err := ioutil.WriteFile(filepath.Join(path, "freezer.state"), []byte(state), 0); err != nil { + return err + } + for { + state_, err := ioutil.ReadFile(filepath.Join(path, "freezer.state")) + if err != nil { + return err + } + if string(state) == string(bytes.TrimSpace(state_)) { + break + } + time.Sleep(1 * time.Millisecond) + } + return nil +} + +func GetPids(c *cgroups.Cgroup) ([]int, error) { + path, err := getSubsystemPath(c, "cpu") + if err != nil { + return nil, err + } + + return cgroups.ReadProcsFile(path) +} + +func getUnitName(c *cgroups.Cgroup) string { + return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name) +} + +/* + * This would be nicer to get from the systemd API when accounting + * is enabled, but sadly there is no way to do that yet. + * The lack of this functionality in the API & the approach taken + * is guided by + * http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/#readingaccountinginformation. + */ +func GetStats(c *cgroups.Cgroup) (*cgroups.Stats, error) { + stats := cgroups.NewStats() + + for sysname, sys := range subsystems { + subsystemPath, err := getSubsystemPath(c, sysname) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + + return nil, err + } + + if err := sys.GetStats(subsystemPath, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +// Atm we can't use the systemd device support because of two missing things: +// * Support for wildcards to allow mknod on any device +// * Support for wildcards to allow /dev/pts support +// +// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is +// in wide use. When both these are availalable we will be able to switch, but need to keep the old +// implementation for backwards compat. +// +// Note: we can't use systemd to set up the initial limits, and then change the cgroup +// because systemd will re-write the device settings if it needs to re-apply the cgroup context. +// This happens at least for v208 when any sibling unit is started. +func joinDevices(c *cgroups.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "devices") + if err != nil { + return err + } + + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return err + } + + if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700); err != nil { + return err + } + + if err := writeFile(path, "devices.deny", "a"); err != nil { + return err + } + + for _, dev := range c.AllowedDevices { + if err := writeFile(path, "devices.allow", dev.GetCgroupAllowString()); err != nil { + return err + } + } + + return nil +} + +func joinMemory(c *cgroups.Cgroup, pid int) error { + memorySwap := c.MemorySwap + + if memorySwap == 0 { + // By default, MemorySwap is set to twice the size of RAM. + memorySwap = c.Memory * 2 + } + + path, err := getSubsystemPath(c, "memory") + if err != nil { + return err + } + + return ioutil.WriteFile(filepath.Join(path, "memory.memsw.limit_in_bytes"), []byte(strconv.FormatInt(memorySwap, 10)), 0700) +} + +// systemd does not atm set up the cpuset controller, so we must manually +// join it. Additionally that is a very finicky controller where each +// level must have a full setup as the default for a new directory is "no cpus" +func joinCpuset(c *cgroups.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "cpuset") + if err != nil { + return err + } + + s := &fs.CpusetGroup{} + + return s.SetDir(path, c.CpusetCpus, pid) +} diff --git a/cgroups/utils.go b/cgroups/utils.go new file mode 100644 index 00000000..77a3c0d7 --- /dev/null +++ b/cgroups/utils.go @@ -0,0 +1,194 @@ +package cgroups + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/docker/docker/pkg/mount" +) + +// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt +func FindCgroupMountpoint(subsystem string) (string, error) { + mounts, err := mount.GetMounts() + if err != nil { + return "", err + } + + for _, mount := range mounts { + if mount.Fstype == "cgroup" { + for _, opt := range strings.Split(mount.VfsOpts, ",") { + if opt == subsystem { + return mount.Mountpoint, nil + } + } + } + } + + return "", NewNotFoundError(subsystem) +} + +type Mount struct { + Mountpoint string + Subsystems []string +} + +func (m Mount) GetThisCgroupDir() (string, error) { + if len(m.Subsystems) == 0 { + return "", fmt.Errorf("no subsystem for mount") + } + + return GetThisCgroupDir(m.Subsystems[0]) +} + +func GetCgroupMounts() ([]Mount, error) { + mounts, err := mount.GetMounts() + if err != nil { + return nil, err + } + + all, err := GetAllSubsystems() + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for _, s := range all { + allMap[s] = true + } + + res := []Mount{} + for _, mount := range mounts { + if mount.Fstype == "cgroup" { + m := Mount{Mountpoint: mount.Mountpoint} + + for _, opt := range strings.Split(mount.VfsOpts, ",") { + if strings.HasPrefix(opt, "name=") { + m.Subsystems = append(m.Subsystems, opt) + } + if allMap[opt] { + m.Subsystems = append(m.Subsystems, opt) + } + } + res = append(res, m) + } + } + return res, nil +} + +// Returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + return subsystems, nil +} + +// Returns the relative path to the cgroup docker is running in. +func GetThisCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/self/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return ParseCgroupFile(subsystem, f) +} + +func GetInitCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/1/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return ParseCgroupFile(subsystem, f) +} + +func ReadProcsFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, "cgroup.procs")) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func ParseCgroupFile(subsystem string, r io.Reader) (string, error) { + s := bufio.NewScanner(r) + + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + + text := s.Text() + parts := strings.Split(text, ":") + + for _, subs := range strings.Split(parts[1], ",") { + if subs == subsystem { + return parts[2], nil + } + } + } + + return "", NewNotFoundError(subsystem) +} + +func pathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +func EnterPid(cgroupPaths map[string]string, pid int) error { + for _, path := range cgroupPaths { + if pathExists(path) { + if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), + []byte(strconv.Itoa(pid)), 0700); err != nil { + return err + } + } + } + + return nil +} diff --git a/config.go b/config.go new file mode 100644 index 00000000..1fb377dc --- /dev/null +++ b/config.go @@ -0,0 +1,89 @@ +package libcontainer + +import ( + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/mount" + "github.com/docker/libcontainer/network" +) + +type MountConfig mount.MountConfig + +type Network network.Network + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // Mount specific options. + MountConfig *MountConfig `json:"mount_config,omitempty"` + + // Pathname to container's root filesystem + RootFs string `json:"root_fs,omitempty"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname,omitempty"` + + // User will set the uid and gid of the executing process running inside the container + User string `json:"user,omitempty"` + + // WorkingDir will change the processes current working directory inside the container's rootfs + WorkingDir string `json:"working_dir,omitempty"` + + // Env will populate the processes environment with the provided values + // Any values from the parent processes will be cleared before the values + // provided in Env are provided to the process + Env []string `json:"environment,omitempty"` + + // Tty when true will allocate a pty slave on the host for access by the container's process + // and ensure that it is mounted inside the container's rootfs + Tty bool `json:"tty,omitempty"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces map[string]bool `json:"namespaces,omitempty"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capbilities not specified will be dropped from the processes capability mask + Capabilities []string `json:"capabilities,omitempty"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks,omitempty"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes,omitempty"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // RestrictSys will remount /proc/sys, /sys, and mask over sysrq-trigger as well as /proc/irq and + // /proc/bus + RestrictSys bool `json:"restrict_sys,omitempty"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and ommitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination,omitempty"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source,omitempty"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway,omitempty"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name,omitempty"` +} diff --git a/config_test.go b/config_test.go new file mode 100644 index 00000000..59812811 --- /dev/null +++ b/config_test.go @@ -0,0 +1,160 @@ +package libcontainer + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/docker/libcontainer/devices" +) + +// Checks whether the expected capability is specified in the capabilities. +func contains(expected string, values []string) bool { + for _, v := range values { + if v == expected { + return true + } + } + return false +} + +func containsDevice(expected *devices.Device, values []*devices.Device) bool { + for _, d := range values { + if d.Path == expected.Path && + d.CgroupPermissions == expected.CgroupPermissions && + d.FileMode == expected.FileMode && + d.MajorNumber == expected.MajorNumber && + d.MinorNumber == expected.MinorNumber && + d.Type == expected.Type { + return true + } + } + return false +} + +func loadConfig(name string) (*Config, error) { + f, err := os.Open(filepath.Join("sample_configs", name)) + if err != nil { + return nil, err + } + defer f.Close() + + var container *Config + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + + return container, nil +} + +func TestConfigJsonFormat(t *testing.T) { + container, err := loadConfig("attach_to_bridge.json") + if err != nil { + t.Fatal(err) + } + + if container.Hostname != "koye" { + t.Log("hostname is not set") + t.Fail() + } + + if !container.Tty { + t.Log("tty should be set to true") + t.Fail() + } + + if !container.Namespaces["NEWNET"] { + t.Log("namespaces should contain NEWNET") + t.Fail() + } + + if container.Namespaces["NEWUSER"] { + t.Log("namespaces should not contain NEWUSER") + t.Fail() + } + + if contains("SYS_ADMIN", container.Capabilities) { + t.Log("SYS_ADMIN should not be enabled in capabilities mask") + t.Fail() + } + + if !contains("MKNOD", container.Capabilities) { + t.Log("MKNOD should be enabled in capabilities mask") + t.Fail() + } + + if !contains("SYS_CHROOT", container.Capabilities) { + t.Log("capabilities mask should contain SYS_CHROOT") + t.Fail() + } + + for _, n := range container.Networks { + if n.Type == "veth" { + if n.Bridge != "docker0" { + t.Logf("veth bridge should be docker0 but received %q", n.Bridge) + t.Fail() + } + + if n.Address != "172.17.0.101/16" { + t.Logf("veth address should be 172.17.0.101/61 but received %q", n.Address) + t.Fail() + } + + if n.VethPrefix != "veth" { + t.Logf("veth prefix should be veth but received %q", n.VethPrefix) + t.Fail() + } + + if n.Gateway != "172.17.42.1" { + t.Logf("veth gateway should be 172.17.42.1 but received %q", n.Gateway) + t.Fail() + } + + if n.Mtu != 1500 { + t.Logf("veth mtu should be 1500 but received %d", n.Mtu) + t.Fail() + } + + break + } + } + + for _, d := range devices.DefaultSimpleDevices { + if !containsDevice(d, container.MountConfig.DeviceNodes) { + t.Logf("expected device configuration for %s", d.Path) + t.Fail() + } + } + + if !container.RestrictSys { + t.Log("expected restrict sys to be true") + t.Fail() + } +} + +func TestApparmorProfile(t *testing.T) { + container, err := loadConfig("apparmor.json") + if err != nil { + t.Fatal(err) + } + + if container.AppArmorProfile != "docker-default" { + t.Fatalf("expected apparmor profile to be docker-default but received %q", container.AppArmorProfile) + } +} + +func TestSelinuxLabels(t *testing.T) { + container, err := loadConfig("selinux.json") + if err != nil { + t.Fatal(err) + } + label := "system_u:system_r:svirt_lxc_net_t:s0:c164,c475" + + if container.ProcessLabel != label { + t.Fatalf("expected process label %q but received %q", label, container.ProcessLabel) + } + if container.MountConfig.MountLabel != label { + t.Fatalf("expected mount label %q but received %q", label, container.MountConfig.MountLabel) + } +} diff --git a/console/console.go b/console/console.go new file mode 100644 index 00000000..346f537d --- /dev/null +++ b/console/console.go @@ -0,0 +1,128 @@ +// +build linux + +package console + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "unsafe" + + "github.com/docker/libcontainer/label" +) + +// Setup initializes the proper /dev/console inside the rootfs path +func Setup(rootfs, consolePath, mountLabel string) error { + oldMask := syscall.Umask(0000) + defer syscall.Umask(oldMask) + + if err := os.Chmod(consolePath, 0600); err != nil { + return err + } + + if err := os.Chown(consolePath, 0, 0); err != nil { + return err + } + + if err := label.SetFileLabel(consolePath, mountLabel); err != nil { + return fmt.Errorf("set file label %s %s", consolePath, err) + } + + dest := filepath.Join(rootfs, "dev/console") + + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return fmt.Errorf("create %s %s", dest, err) + } + + if f != nil { + f.Close() + } + + if err := syscall.Mount(consolePath, dest, "bind", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("bind %s to %s %s", consolePath, dest, err) + } + + return nil +} + +func OpenAndDup(consolePath string) error { + slave, err := OpenTerminal(consolePath, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + + if err := syscall.Dup2(int(slave.Fd()), 0); err != nil { + return err + } + + if err := syscall.Dup2(int(slave.Fd()), 1); err != nil { + return err + } + + return syscall.Dup2(int(slave.Fd()), 2) +} + +// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// Unlockpt should be called before opening the slave side of a pseudoterminal. +func Unlockpt(f *os.File) error { + var u int + + return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// Ptsname retrieves the name of the first available pts for the given master. +func Ptsname(f *os.File) (string, error) { + var n int + + if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container +func CreateMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + + console, err := Ptsname(master) + if err != nil { + return nil, "", err + } + + if err := Unlockpt(master); err != nil { + return nil, "", err + } + + return master, console, nil +} + +// OpenPtmx opens /dev/ptmx, i.e. the PTY master. +func OpenPtmx() (*os.File, error) { + // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace +func OpenTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{Op: "open", Path: name, Err: e} + } + return os.NewFile(uintptr(r), name), nil +} + +func Ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + + return nil +} diff --git a/container.go b/container.go new file mode 100644 index 00000000..307e8cbc --- /dev/null +++ b/container.go @@ -0,0 +1,78 @@ +/* +NOTE: The API is in flux and mainly not implemented. Proceed with caution until further notice. +*/ +package libcontainer + +// A libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. +type Container interface { + // Returns the ID of the container + ID() string + + // Returns the current run state of the container. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + RunState() (*RunState, Error) + + // Returns the current config of the container. + Config() *Config + + // Start a process inside the container. Returns the PID of the new process (in the caller process's namespace) and a channel that will return the exit status of the process whenever it dies. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Start(config *ProcessConfig) (pid int, exitChan chan int, err Error) + + // Destroys the container after killing all running processes. + // + // Any event registrations are removed before the container is destroyed. + // No error is returned if the container is already destroyed. + // + // Errors: + // SystemError - System error. + Destroy() Error + + // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + // + // Some of the returned PIDs may no longer refer to processes in the Container, unless + // the Container state is PAUSED in which case every PID in the slice is valid. + Processes() ([]int, Error) + + // Returns statistics for the container. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + Stats() (*ContainerStats, Error) + + // If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses + // the execution of any user processes. Asynchronously, when the container finished being paused the + // state is changed to PAUSED. + // If the Container state is PAUSED, do nothing. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + Pause() Error + + // If the Container state is PAUSED, resumes the execution of any user processes in the + // Container before setting the Container state to RUNNING. + // If the Container state is RUNNING, do nothing. + // + // Errors: + // ContainerDestroyed - Container no longer exists, + // SystemError - System error. + Resume() Error +} diff --git a/devices/defaults.go b/devices/defaults.go new file mode 100644 index 00000000..e0ad0b08 --- /dev/null +++ b/devices/defaults.go @@ -0,0 +1,159 @@ +package devices + +var ( + // These are devices that are to be both allowed and created. + + DefaultSimpleDevices = []*Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + MajorNumber: 1, + MinorNumber: 3, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + MajorNumber: 1, + MinorNumber: 5, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + MajorNumber: 1, + MinorNumber: 7, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + MajorNumber: 5, + MinorNumber: 0, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + MajorNumber: 1, + MinorNumber: 9, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + MajorNumber: 1, + MinorNumber: 8, + CgroupPermissions: "rwm", + FileMode: 0666, + }, + } + + DefaultAllowedDevices = append([]*Device{ + // allow mknod for any device + { + Type: 'c', + MajorNumber: Wildcard, + MinorNumber: Wildcard, + CgroupPermissions: "m", + }, + { + Type: 'b', + MajorNumber: Wildcard, + MinorNumber: Wildcard, + CgroupPermissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + MajorNumber: 5, + MinorNumber: 1, + CgroupPermissions: "rwm", + }, + { + Path: "/dev/tty0", + Type: 'c', + MajorNumber: 4, + MinorNumber: 0, + CgroupPermissions: "rwm", + }, + { + Path: "/dev/tty1", + Type: 'c', + MajorNumber: 4, + MinorNumber: 1, + CgroupPermissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + MajorNumber: 136, + MinorNumber: Wildcard, + CgroupPermissions: "rwm", + }, + { + Path: "", + Type: 'c', + MajorNumber: 5, + MinorNumber: 2, + CgroupPermissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + MajorNumber: 10, + MinorNumber: 200, + CgroupPermissions: "rwm", + }, + + /*// fuse + { + Path: "", + Type: 'c', + MajorNumber: 10, + MinorNumber: 229, + CgroupPermissions: "rwm", + }, + + // rtc + { + Path: "", + Type: 'c', + MajorNumber: 254, + MinorNumber: 0, + CgroupPermissions: "rwm", + }, + */ + }, DefaultSimpleDevices...) + + DefaultAutoCreatedDevices = append([]*Device{ + { + // /dev/fuse is created but not allowed. + // This is to allow java to work. Because java + // Insists on there being a /dev/fuse + // https://github.com/docker/docker/issues/514 + // https://github.com/docker/docker/issues/2393 + // + Path: "/dev/fuse", + Type: 'c', + MajorNumber: 10, + MinorNumber: 229, + CgroupPermissions: "rwm", + }, + }, DefaultSimpleDevices...) +) diff --git a/devices/devices.go b/devices/devices.go new file mode 100644 index 00000000..558f7f5f --- /dev/null +++ b/devices/devices.go @@ -0,0 +1,129 @@ +package devices + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" +) + +const ( + Wildcard = -1 +) + +var ( + ErrNotADeviceNode = errors.New("not a device node") +) + +// Testing dependencies +var ( + osLstat = os.Lstat + ioutilReadDir = ioutil.ReadDir +) + +type Device struct { + Type rune `json:"type,omitempty"` + Path string `json:"path,omitempty"` // It is fine if this is an empty string in the case that you are using Wildcards + MajorNumber int64 `json:"major_number,omitempty"` // Use the wildcard constant for wildcards. + MinorNumber int64 `json:"minor_number,omitempty"` // Use the wildcard constant for wildcards. + CgroupPermissions string `json:"cgroup_permissions,omitempty"` // Typically just "rwm" + FileMode os.FileMode `json:"file_mode,omitempty"` // The permission bits of the file's mode + Uid uint32 `json:"uid,omitempty"` + Gid uint32 `json:"gid,omitempty"` +} + +func GetDeviceNumberString(deviceNumber int64) string { + if deviceNumber == Wildcard { + return "*" + } else { + return fmt.Sprintf("%d", deviceNumber) + } +} + +func (device *Device) GetCgroupAllowString() string { + return fmt.Sprintf("%c %s:%s %s", device.Type, GetDeviceNumberString(device.MajorNumber), GetDeviceNumberString(device.MinorNumber), device.CgroupPermissions) +} + +// Given the path to a device and it's cgroup_permissions(which cannot be easilly queried) look up the information about a linux device and return that information as a Device struct. +func GetDevice(path, cgroupPermissions string) (*Device, error) { + fileInfo, err := osLstat(path) + if err != nil { + return nil, err + } + + var ( + devType rune + mode = fileInfo.Mode() + fileModePermissionBits = os.FileMode.Perm(mode) + ) + + switch { + case mode&os.ModeDevice == 0: + return nil, ErrNotADeviceNode + case mode&os.ModeCharDevice != 0: + fileModePermissionBits |= syscall.S_IFCHR + devType = 'c' + default: + fileModePermissionBits |= syscall.S_IFBLK + devType = 'b' + } + + stat_t, ok := fileInfo.Sys().(*syscall.Stat_t) + if !ok { + return nil, fmt.Errorf("cannot determine the device number for device %s", path) + } + devNumber := int(stat_t.Rdev) + + return &Device{ + Type: devType, + Path: path, + MajorNumber: Major(devNumber), + MinorNumber: Minor(devNumber), + CgroupPermissions: cgroupPermissions, + FileMode: fileModePermissionBits, + Uid: stat_t.Uid, + Gid: stat_t.Gid, + }, nil +} + +func GetHostDeviceNodes() ([]*Device, error) { + return getDeviceNodes("/dev") +} + +func getDeviceNodes(path string) ([]*Device, error) { + files, err := ioutilReadDir(path) + if err != nil { + return nil, err + } + + out := []*Device{} + for _, f := range files { + if f.IsDir() { + switch f.Name() { + case "pts", "shm", "fd": + continue + default: + sub, err := getDeviceNodes(filepath.Join(path, f.Name())) + if err != nil { + return nil, err + } + + out = append(out, sub...) + continue + } + } + + device, err := GetDevice(filepath.Join(path, f.Name()), "rwm") + if err != nil { + if err == ErrNotADeviceNode { + continue + } + return nil, err + } + out = append(out, device) + } + + return out, nil +} diff --git a/devices/devices_test.go b/devices/devices_test.go new file mode 100644 index 00000000..fec40022 --- /dev/null +++ b/devices/devices_test.go @@ -0,0 +1,61 @@ +package devices + +import ( + "errors" + "os" + "testing" +) + +func TestGetDeviceLstatFailure(t *testing.T) { + testError := errors.New("test error") + + // Override os.Lstat to inject error. + osLstat = func(path string) (os.FileInfo, error) { + return nil, testError + } + + _, err := GetDevice("", "") + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestGetHostDeviceNodesIoutilReadDirFailure(t *testing.T) { + testError := errors.New("test error") + + // Override ioutil.ReadDir to inject error. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + return nil, testError + } + + _, err := GetHostDeviceNodes() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestGetHostDeviceNodesIoutilReadDirDeepFailure(t *testing.T) { + testError := errors.New("test error") + called := false + + // Override ioutil.ReadDir to inject error after the first call. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + if called { + return nil, testError + } + called = true + + // Provoke a second call. + fi, err := os.Lstat("/tmp") + if err != nil { + t.Fatalf("Unexpected error %v", err) + } + + return []os.FileInfo{fi}, nil + } + + _, err := GetHostDeviceNodes() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} diff --git a/devices/number.go b/devices/number.go new file mode 100644 index 00000000..3aae380b --- /dev/null +++ b/devices/number.go @@ -0,0 +1,26 @@ +package devices + +/* + +This code provides support for manipulating linux device numbers. It should be replaced by normal syscall functions once http://code.google.com/p/go/issues/detail?id=8106 is solved. + +You can read what they are here: + + - http://www.makelinux.net/ldd3/chp-3-sect-2 + - http://www.linux-tutorial.info/modules.php?name=MContent&pageid=94 + +Note! These are NOT the same as the MAJOR(dev_t device);, MINOR(dev_t device); and MKDEV(int major, int minor); functions as defined in as the representation of device numbers used by go is different than the one used internally to the kernel! - https://github.com/torvalds/linux/blob/master/include/linux/kdev_t.h#L9 + +*/ + +func Major(devNumber int) int64 { + return int64((devNumber >> 8) & 0xfff) +} + +func Minor(devNumber int) int64 { + return int64((devNumber & 0xff) | ((devNumber >> 12) & 0xfff00)) +} + +func Mkdev(majorNumber int64, minorNumber int64) int { + return int((majorNumber << 8) | (minorNumber & 0xff) | ((minorNumber & 0xfff00) << 12)) +} diff --git a/error.go b/error.go new file mode 100644 index 00000000..5ff56d80 --- /dev/null +++ b/error.go @@ -0,0 +1,37 @@ +package libcontainer + +// API error code type. +type ErrorCode int + +// API error codes. +const ( + // Factory errors + IdInUse ErrorCode = iota + InvalidIdFormat + // TODO: add Load errors + + // Container errors + ContainerDestroyed + ContainerPaused + + // Common errors + ConfigInvalid + SystemError +) + +// API Error type. +type Error interface { + error + + // Returns the stack trace, if any, which identifies the + // point at which the error occurred. + Stack() []byte + + // Returns a verbose string including the error message + // and a representation of the stack trace suitable for + // printing. + Detail() string + + // Returns the error code for this error. + Code() ErrorCode +} diff --git a/factory.go b/factory.go new file mode 100644 index 00000000..e37773b2 --- /dev/null +++ b/factory.go @@ -0,0 +1,32 @@ +package libcontainer + +type Factory interface { + + // Creates a new container with the given id and starts the initial process inside it. + // id must be a string containing only letters, digits and underscores and must contain + // between 1 and 1024 characters, inclusive. + // + // The id must not already be in use by an existing container. Containers created using + // a factory with the same path (and file system) must have distinct ids. + // + // Returns the new container with a running process. + // + // Errors: + // IdInUse - id is already in use by a container + // InvalidIdFormat - id has incorrect format + // ConfigInvalid - config is invalid + // SystemError - System error + // + // On error, any partially created container parts are cleaned up (the operation is atomic). + Create(id string, config *Config) (Container, Error) + + // Load takes an ID for an existing container and reconstructs the container + // from the state. + // + // Errors: + // Path does not exist + // Container is stopped + // System error + // TODO: fix description + Load(id string) (Container, Error) +} diff --git a/label/label.go b/label/label.go new file mode 100644 index 00000000..ce60296e --- /dev/null +++ b/label/label.go @@ -0,0 +1,45 @@ +// +build !selinux !linux + +package label + +// InitLabels returns the process label and file labels to be used within +// the container. A list of options can be passed into this function to alter +// the labels. +func InitLabels(options []string) (string, string, error) { + return "", "", nil +} + +func GenLabels(options string) (string, string, error) { + return "", "", nil +} + +func FormatMountLabel(src string, mountLabel string) string { + return src +} + +func SetProcessLabel(processLabel string) error { + return nil +} + +func SetFileLabel(path string, fileLabel string) error { + return nil +} + +func Relabel(path string, fileLabel string, relabel string) error { + return nil +} + +func GetPidLabel(pid int) (string, error) { + return "", nil +} + +func Init() { +} + +func ReserveLabel(label string) error { + return nil +} + +func UnreserveLabel(label string) error { + return nil +} diff --git a/label/label_selinux.go b/label/label_selinux.go new file mode 100644 index 00000000..65b84797 --- /dev/null +++ b/label/label_selinux.go @@ -0,0 +1,132 @@ +// +build selinux,linux + +package label + +import ( + "fmt" + "strings" + + "github.com/docker/libcontainer/selinux" +) + +// InitLabels returns the process label and file labels to be used within +// the container. A list of options can be passed into this function to alter +// the labels. The labels returned will include a random MCS String, that is +// guaranteed to be unique. +func InitLabels(options []string) (string, string, error) { + if !selinux.SelinuxEnabled() { + return "", "", nil + } + var err error + processLabel, mountLabel := selinux.GetLxcContexts() + if processLabel != "" { + pcon := selinux.NewContext(processLabel) + mcon := selinux.NewContext(mountLabel) + for _, opt := range options { + if opt == "disable" { + return "", "", nil + } + if i := strings.Index(opt, ":"); i == -1 { + return "", "", fmt.Errorf("Bad SELinux Option") + } + con := strings.SplitN(opt, ":", 2) + pcon[con[0]] = con[1] + if con[0] == "level" || con[0] == "user" { + mcon[con[0]] = con[1] + } + } + processLabel = pcon.Get() + mountLabel = mcon.Get() + } + return processLabel, mountLabel, err +} + +// DEPRECATED: The GenLabels function is only to be used during the transition to the official API. +func GenLabels(options string) (string, string, error) { + return InitLabels(strings.Fields(options)) +} + +// FormatMountLabel returns a string to be used by the mount command. +// The format of this string will be used to alter the labeling of the mountpoint. +// The string returned is suitable to be used as the options field of the mount command. +// If you need to have additional mount point options, you can pass them in as +// the first parameter. Second parameter is the label that you wish to apply +// to all content in the mount point. +func FormatMountLabel(src, mountLabel string) string { + if mountLabel != "" { + switch src { + case "": + src = fmt.Sprintf("context=%q", mountLabel) + default: + src = fmt.Sprintf("%s,context=%q", src, mountLabel) + } + } + return src +} + +// SetProcessLabel takes a process label and tells the kernel to assign the +// label to the next program executed by the current process. +func SetProcessLabel(processLabel string) error { + if processLabel == "" { + return nil + } + return selinux.Setexeccon(processLabel) +} + +// GetProcessLabel returns the process label that the kernel will assign +// to the next program executed by the current process. If "" is returned +// this indicates that the default labeling will happen for the process. +func GetProcessLabel() (string, error) { + return selinux.Getexeccon() +} + +// SetFileLabel modifies the "path" label to the specified file label +func SetFileLabel(path string, fileLabel string) error { + if selinux.SelinuxEnabled() && fileLabel != "" { + return selinux.Setfilecon(path, fileLabel) + } + return nil +} + +// Change the label of path to the filelabel string. If the relabel string +// is "z", relabel will change the MCS label to s0. This will allow all +// containers to share the content. If the relabel string is a "Z" then +// the MCS label should continue to be used. SELinux will use this field +// to make sure the content can not be shared by other containes. +func Relabel(path string, fileLabel string, relabel string) error { + if fileLabel == "" { + return nil + } + if relabel == "z" { + c := selinux.NewContext(fileLabel) + c["level"] = "s0" + fileLabel = c.Get() + } + return selinux.Chcon(path, fileLabel, true) +} + +// GetPidLabel will return the label of the process running with the specified pid +func GetPidLabel(pid int) (string, error) { + return selinux.Getpidcon(pid) +} + +// Init initialises the labeling system +func Init() { + selinux.SelinuxEnabled() +} + +// ReserveLabel will record the fact that the MCS label has already been used. +// This will prevent InitLabels from using the MCS label in a newly created +// container +func ReserveLabel(label string) error { + selinux.ReserveLabel(label) + return nil +} + +// UnreserveLabel will remove the reservation of the MCS label. +// This will allow InitLabels to use the MCS label in a newly created +// containers +func UnreserveLabel(label string) error { + selinux.FreeLxcContexts(label) + return nil +} diff --git a/label/label_selinux_test.go b/label/label_selinux_test.go new file mode 100644 index 00000000..c83654f6 --- /dev/null +++ b/label/label_selinux_test.go @@ -0,0 +1,48 @@ +// +build selinux,linux + +package label + +import ( + "testing" + + "github.com/docker/libcontainer/selinux" +) + +func TestInit(t *testing.T) { + if selinux.SelinuxEnabled() { + var testNull []string + plabel, mlabel, err := InitLabels(testNull) + if err != nil { + t.Log("InitLabels Failed") + t.Fatal(err) + } + testDisabled := []string{"disable"} + plabel, mlabel, err = InitLabels(testDisabled) + if err != nil { + t.Log("InitLabels Disabled Failed") + t.Fatal(err) + } + if plabel != "" { + t.Log("InitLabels Disabled Failed") + t.Fatal() + } + testUser := []string{"user:user_u", "role:user_r", "type:user_t", "level:s0:c1,c15"} + plabel, mlabel, err = InitLabels(testUser) + if err != nil { + t.Log("InitLabels User Failed") + t.Fatal(err) + } + if plabel != "user_u:user_r:user_t:s0:c1,c15" || mlabel != "user_u:object_r:svirt_sandbox_file_t:s0:c1,c15" { + t.Log("InitLabels User Failed") + t.Log(plabel, mlabel) + t.Fatal(err) + } + + testBadData := []string{"user", "role:user_r", "type:user_t", "level:s0:c1,c15"} + plabel, mlabel, err = InitLabels(testBadData) + if err == nil { + t.Log("InitLabels Bad Failed") + t.Fatal(err) + } + } +} diff --git a/mount/init.go b/mount/init.go new file mode 100644 index 00000000..ea2b7327 --- /dev/null +++ b/mount/init.go @@ -0,0 +1,208 @@ +// +build linux + +package mount + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/mount/nodes" +) + +// default mount point flags +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +type mount struct { + source string + path string + device string + flags int + data string +} + +// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a +// new mount namespace. +func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountConfig *MountConfig) error { + var ( + err error + flag = syscall.MS_PRIVATE + ) + + if mountConfig.NoPivotRoot { + flag = syscall.MS_SLAVE + } + + if err := syscall.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { + return fmt.Errorf("mounting / with flags %X %s", (flag | syscall.MS_REC), err) + } + + if err := syscall.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + + if err := mountSystem(rootfs, sysReadonly, mountConfig); err != nil { + return fmt.Errorf("mount system %s", err) + } + + // apply any user specified mounts within the new mount namespace + for _, m := range mountConfig.Mounts { + if err := m.Mount(rootfs, mountConfig.MountLabel); err != nil { + return err + } + } + + if err := nodes.CreateDeviceNodes(rootfs, mountConfig.DeviceNodes); err != nil { + return fmt.Errorf("create device nodes %s", err) + } + + if err := SetupPtmx(rootfs, console, mountConfig.MountLabel); err != nil { + return err + } + + // stdin, stdout and stderr could be pointing to /dev/null from parent namespace. + // Re-open them inside this namespace. + if err := reOpenDevNull(rootfs); err != nil { + return fmt.Errorf("Failed to reopen /dev/null %s", err) + } + + if err := setupDevSymlinks(rootfs); err != nil { + return fmt.Errorf("dev symlinks %s", err) + } + + if err := syscall.Chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + + if mountConfig.NoPivotRoot { + err = MsMoveRoot(rootfs) + } else { + err = PivotRoot(rootfs) + } + + if err != nil { + return err + } + + if mountConfig.ReadonlyFs { + if err := SetReadonly(); err != nil { + return fmt.Errorf("set readonly %s", err) + } + } + + syscall.Umask(0022) + + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string, sysReadonly bool, mountConfig *MountConfig) error { + for _, m := range newSystemMounts(rootfs, mountConfig.MountLabel, sysReadonly) { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := syscall.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +func createIfNotExists(path string, isDir bool) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + } else { + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + } + return nil +} + +func setupDevSymlinks(rootfs string) error { + var links = [][2]string{ + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } + + // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink + // in /dev if it exists in /proc. + if _, err := os.Stat("/proc/kcore"); err == nil { + links = append(links, [2]string{"/proc/kcore", "/dev/kcore"}) + } + + for _, link := range links { + var ( + src = link[0] + dst = filepath.Join(rootfs, link[1]) + ) + + if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink %s %s %s", src, dst, err) + } + } + + return nil +} + +// TODO: this is crappy right now and should be cleaned up with a better way of handling system and +// standard bind mounts allowing them to be more dynamic +func newSystemMounts(rootfs, mountLabel string, sysReadonly bool) []mount { + systemMounts := []mount{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, + } + + sysMountFlags := defaultMountFlags + if sysReadonly { + sysMountFlags |= syscall.MS_RDONLY + } + + systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: sysMountFlags}) + + return systemMounts +} + +// Is stdin, stdout or stderr were to be pointing to '/dev/null', +// this method will make them point to '/dev/null' from within this namespace. +func reOpenDevNull(rootfs string) error { + var stat, devNullStat syscall.Stat_t + file, err := os.Open(filepath.Join(rootfs, "/dev/null")) + if err != nil { + return fmt.Errorf("Failed to open /dev/null - %s", err) + } + defer file.Close() + if err = syscall.Fstat(int(file.Fd()), &devNullStat); err != nil { + return fmt.Errorf("Failed to stat /dev/null - %s", err) + } + for fd := 0; fd < 3; fd++ { + if err = syscall.Fstat(fd, &stat); err != nil { + return fmt.Errorf("Failed to stat fd %d - %s", fd, err) + } + if stat.Rdev == devNullStat.Rdev { + // Close and re-open the fd. + if err = syscall.Dup2(int(file.Fd()), fd); err != nil { + return fmt.Errorf("Failed to dup fd %d to fd %d - %s", file.Fd(), fd, err) + } + } + } + return nil +} diff --git a/mount/mount.go b/mount/mount.go new file mode 100644 index 00000000..c1b42421 --- /dev/null +++ b/mount/mount.go @@ -0,0 +1,109 @@ +package mount + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/docker/docker/pkg/symlink" + "github.com/docker/libcontainer/label" +) + +type Mount struct { + Type string `json:"type,omitempty"` + Source string `json:"source,omitempty"` // Source path, in the host namespace + Destination string `json:"destination,omitempty"` // Destination path, in the container + Writable bool `json:"writable,omitempty"` + Relabel string `json:"relabel,omitempty"` // Relabel source if set, "z" indicates shared, "Z" indicates unshared + Private bool `json:"private,omitempty"` + Slave bool `json:"slave,omitempty"` +} + +func (m *Mount) Mount(rootfs, mountLabel string) error { + switch m.Type { + case "bind": + return m.bindMount(rootfs, mountLabel) + case "tmpfs": + return m.tmpfsMount(rootfs, mountLabel) + default: + return fmt.Errorf("unsupported mount type %s for %s", m.Type, m.Destination) + } +} + +func (m *Mount) bindMount(rootfs, mountLabel string) error { + var ( + flags = syscall.MS_BIND | syscall.MS_REC + dest = filepath.Join(rootfs, m.Destination) + ) + + if !m.Writable { + flags = flags | syscall.MS_RDONLY + } + + if m.Slave { + flags = flags | syscall.MS_SLAVE + } + + stat, err := os.Stat(m.Source) + if err != nil { + return err + } + + // FIXME: (crosbymichael) This does not belong here and should be done a layer above + dest, err = symlink.FollowSymlinkInScope(dest, rootfs) + if err != nil { + return err + } + + if err := createIfNotExists(dest, stat.IsDir()); err != nil { + return fmt.Errorf("creating new bind mount target %s", err) + } + + if err := syscall.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err) + } + + if !m.Writable { + if err := syscall.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil { + return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err) + } + } + + if m.Relabel != "" { + if err := label.Relabel(m.Source, mountLabel, m.Relabel); err != nil { + return fmt.Errorf("relabeling %s to %s %s", m.Source, mountLabel, err) + } + } + + if m.Private { + if err := syscall.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil { + return fmt.Errorf("mounting %s private %s", dest, err) + } + } + + return nil +} + +func (m *Mount) tmpfsMount(rootfs, mountLabel string) error { + var ( + err error + l = label.FormatMountLabel("", mountLabel) + dest = filepath.Join(rootfs, m.Destination) + ) + + // FIXME: (crosbymichael) This does not belong here and should be done a layer above + if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + return err + } + + if err := createIfNotExists(dest, true); err != nil { + return fmt.Errorf("creating new tmpfs mount target %s", err) + } + + if err := syscall.Mount("tmpfs", dest, "tmpfs", uintptr(defaultMountFlags), l); err != nil { + return fmt.Errorf("%s mounting %s in tmpfs", err, dest) + } + + return nil +} diff --git a/mount/mount_config.go b/mount/mount_config.go new file mode 100644 index 00000000..eef9b8ce --- /dev/null +++ b/mount/mount_config.go @@ -0,0 +1,28 @@ +package mount + +import ( + "errors" + + "github.com/docker/libcontainer/devices" +) + +var ErrUnsupported = errors.New("Unsupported method") + +type MountConfig struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root,omitempty"` + + // ReadonlyFs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable + ReadonlyFs bool `json:"readonly_fs,omitempty"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts,omitempty"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + DeviceNodes []*devices.Device `json:"device_nodes,omitempty"` + + MountLabel string `json:"mount_label,omitempty"` +} diff --git a/mount/msmoveroot.go b/mount/msmoveroot.go new file mode 100644 index 00000000..94afd3a9 --- /dev/null +++ b/mount/msmoveroot.go @@ -0,0 +1,20 @@ +// +build linux + +package mount + +import ( + "fmt" + "syscall" +) + +func MsMoveRoot(rootfs string) error { + if err := syscall.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + + if err := syscall.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + + return syscall.Chdir("/") +} diff --git a/mount/nodes/nodes.go b/mount/nodes/nodes.go new file mode 100644 index 00000000..322c0c0e --- /dev/null +++ b/mount/nodes/nodes.go @@ -0,0 +1,57 @@ +// +build linux + +package nodes + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/docker/libcontainer/devices" +) + +// Create the device nodes in the container. +func CreateDeviceNodes(rootfs string, nodesToCreate []*devices.Device) error { + oldMask := syscall.Umask(0000) + defer syscall.Umask(oldMask) + + for _, node := range nodesToCreate { + if err := CreateDeviceNode(rootfs, node); err != nil { + return err + } + } + return nil +} + +// Creates the device node in the rootfs of the container. +func CreateDeviceNode(rootfs string, node *devices.Device) error { + var ( + dest = filepath.Join(rootfs, node.Path) + parent = filepath.Dir(dest) + ) + + if err := os.MkdirAll(parent, 0755); err != nil { + return err + } + + fileMode := node.FileMode + switch node.Type { + case 'c': + fileMode |= syscall.S_IFCHR + case 'b': + fileMode |= syscall.S_IFBLK + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + + if err := syscall.Mknod(dest, uint32(fileMode), devices.Mkdev(node.MajorNumber, node.MinorNumber)); err != nil && !os.IsExist(err) { + return fmt.Errorf("mknod %s %s", node.Path, err) + } + + if err := syscall.Chown(dest, int(node.Uid), int(node.Gid)); err != nil { + return fmt.Errorf("chown %s to %d:%d", node.Path, node.Uid, node.Gid) + } + + return nil +} diff --git a/mount/nodes/nodes_unsupported.go b/mount/nodes/nodes_unsupported.go new file mode 100644 index 00000000..83660715 --- /dev/null +++ b/mount/nodes/nodes_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux + +package nodes + +import ( + "errors" + + "github.com/docker/libcontainer/devices" +) + +func CreateDeviceNodes(rootfs string, nodesToCreate []*devices.Device) error { + return errors.New("Unsupported method") +} diff --git a/mount/pivotroot.go b/mount/pivotroot.go new file mode 100644 index 00000000..a88ed4a8 --- /dev/null +++ b/mount/pivotroot.go @@ -0,0 +1,34 @@ +// +build linux + +package mount + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" +) + +func PivotRoot(rootfs string) error { + pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) + } + + if err := syscall.PivotRoot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + + if err := syscall.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + + // path to pivot dir now changed, update + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + + return os.Remove(pivotDir) +} diff --git a/mount/ptmx.go b/mount/ptmx.go new file mode 100644 index 00000000..c316481a --- /dev/null +++ b/mount/ptmx.go @@ -0,0 +1,30 @@ +// +build linux + +package mount + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/docker/libcontainer/console" +) + +func SetupPtmx(rootfs, consolePath, mountLabel string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + + if consolePath != "" { + if err := console.Setup(rootfs, consolePath, mountLabel); err != nil { + return err + } + } + + return nil +} diff --git a/mount/readonly.go b/mount/readonly.go new file mode 100644 index 00000000..9b4a6f70 --- /dev/null +++ b/mount/readonly.go @@ -0,0 +1,11 @@ +// +build linux + +package mount + +import ( + "syscall" +) + +func SetReadonly() error { + return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "") +} diff --git a/mount/remount.go b/mount/remount.go new file mode 100644 index 00000000..99a01209 --- /dev/null +++ b/mount/remount.go @@ -0,0 +1,31 @@ +// +build linux + +package mount + +import "syscall" + +func RemountProc() error { + if err := syscall.Unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + + if err := syscall.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { + return err + } + + return nil +} + +func RemountSys() error { + if err := syscall.Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := syscall.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { + return err + } + } + + return nil +} diff --git a/namespaces/create.go b/namespaces/create.go new file mode 100644 index 00000000..b6418b6e --- /dev/null +++ b/namespaces/create.go @@ -0,0 +1,10 @@ +package namespaces + +import ( + "os" + "os/exec" + + "github.com/docker/libcontainer" +) + +type CreateCommand func(container *libcontainer.Config, console, dataPath, init string, childPipe *os.File, args []string) *exec.Cmd diff --git a/namespaces/exec.go b/namespaces/exec.go new file mode 100644 index 00000000..4440ccd0 --- /dev/null +++ b/namespaces/exec.go @@ -0,0 +1,200 @@ +// +build linux + +package namespaces + +import ( + "io" + "os" + "os/exec" + "syscall" + + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/cgroups/fs" + "github.com/docker/libcontainer/cgroups/systemd" + "github.com/docker/libcontainer/network" + "github.com/docker/libcontainer/syncpipe" + "github.com/docker/libcontainer/system" +) + +// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work. +// Move this to libcontainer package. +// Exec performs setup outside of a namespace so that a container can be +// executed. Exec is a high level function for working with container namespaces. +func Exec(container *libcontainer.Config, stdin io.Reader, stdout, stderr io.Writer, console, dataPath string, args []string, createCommand CreateCommand, startCallback func()) (int, error) { + var ( + err error + ) + + // create a pipe so that we can syncronize with the namespaced process and + // pass the veth name to the child + syncPipe, err := syncpipe.NewSyncPipe() + if err != nil { + return -1, err + } + defer syncPipe.Close() + + command := createCommand(container, console, dataPath, os.Args[0], syncPipe.Child(), args) + // Note: these are only used in non-tty mode + // if there is a tty for the container it will be opened within the namespace and the + // fds will be duped to stdin, stdiout, and stderr + command.Stdin = stdin + command.Stdout = stdout + command.Stderr = stderr + + if err := command.Start(); err != nil { + return -1, err + } + + // Now we passed the pipe to the child, close our side + syncPipe.CloseChild() + + started, err := system.GetProcessStartTime(command.Process.Pid) + if err != nil { + return -1, err + } + + // Do this before syncing with child so that no children + // can escape the cgroup + cgroupRef, err := SetupCgroups(container, command.Process.Pid) + if err != nil { + command.Process.Kill() + command.Wait() + return -1, err + } + defer cgroupRef.Cleanup() + + cgroupPaths, err := cgroupRef.Paths() + if err != nil { + command.Process.Kill() + command.Wait() + return -1, err + } + + var networkState network.NetworkState + if err := InitializeNetworking(container, command.Process.Pid, syncPipe, &networkState); err != nil { + command.Process.Kill() + command.Wait() + return -1, err + } + + state := &libcontainer.State{ + InitPid: command.Process.Pid, + InitStartTime: started, + NetworkState: networkState, + CgroupPaths: cgroupPaths, + } + + if err := libcontainer.SaveState(dataPath, state); err != nil { + command.Process.Kill() + command.Wait() + return -1, err + } + defer libcontainer.DeleteState(dataPath) + + // Sync with child + if err := syncPipe.ReadFromChild(); err != nil { + command.Process.Kill() + command.Wait() + return -1, err + } + + if startCallback != nil { + startCallback() + } + + if err := command.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } + } + + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil +} + +// DefaultCreateCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +// +// console: the /dev/console to setup inside the container +// init: the program executed inside the namespaces +// root: the path to the container json file and information +// pipe: sync pipe to synchronize the parent and child processes +// args: the arguments to pass to the container to run as the user's program +func DefaultCreateCommand(container *libcontainer.Config, console, dataPath, init string, pipe *os.File, args []string) *exec.Cmd { + // get our binary name from arg0 so we can always reexec ourself + env := []string{ + "console=" + console, + "pipe=3", + "data_path=" + dataPath, + } + + /* + TODO: move user and wd into env + if user != "" { + env = append(env, "user="+user) + } + if workingDir != "" { + env = append(env, "wd="+workingDir) + } + */ + + command := exec.Command(init, append([]string{"init", "--"}, args...)...) + // make sure the process is executed inside the context of the rootfs + command.Dir = container.RootFs + command.Env = append(os.Environ(), env...) + + if command.SysProcAttr == nil { + command.SysProcAttr = &syscall.SysProcAttr{} + } + command.SysProcAttr.Cloneflags = uintptr(GetNamespaceFlags(container.Namespaces)) + + command.SysProcAttr.Pdeathsig = syscall.SIGKILL + command.ExtraFiles = []*os.File{pipe} + + return command +} + +// SetupCgroups applies the cgroup restrictions to the process running in the container based +// on the container's configuration +func SetupCgroups(container *libcontainer.Config, nspid int) (cgroups.ActiveCgroup, error) { + if container.Cgroups != nil { + c := container.Cgroups + + if systemd.UseSystemd() { + return systemd.Apply(c, nspid) + } + + return fs.Apply(c, nspid) + } + + return nil, nil +} + +// InitializeNetworking creates the container's network stack outside of the namespace and moves +// interfaces into the container's net namespaces if necessary +func InitializeNetworking(container *libcontainer.Config, nspid int, pipe *syncpipe.SyncPipe, networkState *network.NetworkState) error { + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.Create((*network.Network)(config), nspid, networkState); err != nil { + return err + } + } + return pipe.SendToChild(networkState) +} + +// GetNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func GetNamespaceFlags(namespaces map[string]bool) (flag int) { + for key, enabled := range namespaces { + if enabled { + if ns := GetNamespace(key); ns != nil { + flag |= ns.Value + } + } + } + return flag +} diff --git a/namespaces/execin.go b/namespaces/execin.go new file mode 100644 index 00000000..53e676ac --- /dev/null +++ b/namespaces/execin.go @@ -0,0 +1,119 @@ +// +build linux + +package namespaces + +import ( + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/apparmor" + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/syncpipe" + "github.com/docker/libcontainer/system" +) + +// ExecIn reexec's the initPath with the argv 0 rewrite to "nsenter" so that it is able to run the +// setns code in a single threaded environment joining the existing containers' namespaces. +func ExecIn(container *libcontainer.Config, state *libcontainer.State, userArgs []string, initPath, action string, + stdin io.Reader, stdout, stderr io.Writer, console string, startCallback func(*exec.Cmd)) (int, error) { + + args := []string{fmt.Sprintf("nsenter-%s", action), "--nspid", strconv.Itoa(state.InitPid)} + + if console != "" { + args = append(args, "--console", console) + } + + cmd := &exec.Cmd{ + Path: initPath, + Args: append(args, append([]string{"--"}, userArgs...)...), + } + + if filepath.Base(initPath) == initPath { + if lp, err := exec.LookPath(initPath); err == nil { + cmd.Path = lp + } + } + + pipe, err := syncpipe.NewSyncPipe() + if err != nil { + return -1, err + } + defer pipe.Close() + + // Note: these are only used in non-tty mode + // if there is a tty for the container it will be opened within the namespace and the + // fds will be duped to stdin, stdiout, and stderr + cmd.Stdin = stdin + cmd.Stdout = stdout + cmd.Stderr = stderr + + cmd.ExtraFiles = []*os.File{pipe.Child()} + + if err := cmd.Start(); err != nil { + return -1, err + } + pipe.CloseChild() + + // Enter cgroups. + if err := EnterCgroups(state, cmd.Process.Pid); err != nil { + return -1, err + } + + if err := pipe.SendToChild(container); err != nil { + cmd.Process.Kill() + cmd.Wait() + return -1, err + } + + if startCallback != nil { + startCallback(cmd) + } + + if err := cmd.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } + } + + return cmd.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil +} + +// Finalize expects that the setns calls have been setup and that is has joined an +// existing namespace +func FinalizeSetns(container *libcontainer.Config, args []string) error { + // clear the current processes env and replace it with the environment defined on the container + if err := LoadContainerEnvironment(container); err != nil { + return err + } + + if err := FinalizeNamespace(container); err != nil { + return err + } + + if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) + } + + if container.ProcessLabel != "" { + if err := label.SetProcessLabel(container.ProcessLabel); err != nil { + return err + } + } + + if err := system.Execv(args[0], args[0:], container.Env); err != nil { + return err + } + + panic("unreachable") +} + +func EnterCgroups(state *libcontainer.State, pid int) error { + return cgroups.EnterPid(state.CgroupPaths, pid) +} diff --git a/namespaces/init.go b/namespaces/init.go new file mode 100644 index 00000000..4c2b3327 --- /dev/null +++ b/namespaces/init.go @@ -0,0 +1,263 @@ +// +build linux + +package namespaces + +import ( + "fmt" + "os" + "strings" + "syscall" + + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/apparmor" + "github.com/docker/libcontainer/console" + "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/mount" + "github.com/docker/libcontainer/netlink" + "github.com/docker/libcontainer/network" + "github.com/docker/libcontainer/security/capabilities" + "github.com/docker/libcontainer/security/restrict" + "github.com/docker/libcontainer/syncpipe" + "github.com/docker/libcontainer/system" + "github.com/docker/libcontainer/user" + "github.com/docker/libcontainer/utils" +) + +// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work. +// Move this to libcontainer package. +// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, +// and other options required for the new container. +// The caller of Init function has to ensure that the go runtime is locked to an OS thread +// (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended. +func Init(container *libcontainer.Config, uncleanRootfs, consolePath string, syncPipe *syncpipe.SyncPipe, args []string) (err error) { + defer func() { + if err != nil { + syncPipe.ReportChildError(err) + } + }() + + rootfs, err := utils.ResolveRootfs(uncleanRootfs) + if err != nil { + return err + } + + // clear the current processes env and replace it with the environment + // defined on the container + if err := LoadContainerEnvironment(container); err != nil { + return err + } + + // We always read this as it is a way to sync with the parent as well + var networkState *network.NetworkState + if err := syncPipe.ReadFromParent(&networkState); err != nil { + return err + } + + if consolePath != "" { + if err := console.OpenAndDup(consolePath); err != nil { + return err + } + } + if _, err := syscall.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if consolePath != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + if err := setupNetwork(container, networkState); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := setupRoute(container); err != nil { + return fmt.Errorf("setup route %s", err) + } + + label.Init() + + if err := mount.InitializeMountNamespace(rootfs, + consolePath, + container.RestrictSys, + (*mount.MountConfig)(container.MountConfig)); err != nil { + return fmt.Errorf("setup mount namespace %s", err) + } + + if container.Hostname != "" { + if err := syscall.Sethostname([]byte(container.Hostname)); err != nil { + return fmt.Errorf("sethostname %s", err) + } + } + + if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) + } + + if err := label.SetProcessLabel(container.ProcessLabel); err != nil { + return fmt.Errorf("set process label %s", err) + } + + // TODO: (crosbymichael) make this configurable at the Config level + if container.RestrictSys { + if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { + return err + } + } + + pdeathSignal, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + + if err := FinalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + + // FinalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := RestoreParentDeathSignal(pdeathSignal); err != nil { + return fmt.Errorf("restore parent death signal %s", err) + } + + return system.Execv(args[0], args[0:], os.Environ()) +} + +// RestoreParentDeathSignal sets the parent death signal to old. +func RestoreParentDeathSignal(old int) error { + if old == 0 { + return nil + } + + current, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + + if old == current { + return nil + } + + if err := system.ParentDeathSignal(uintptr(old)); err != nil { + return fmt.Errorf("set parent death signal %s", err) + } + + // Signal self if parent is already dead. Does nothing if running in a new + // PID namespace, as Getppid will always return 0. + if syscall.Getppid() == 1 { + return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) + } + + return nil +} + +// SetupUser changes the groups, gid, and uid for the user inside the container +func SetupUser(u string) error { + uid, gid, suppGids, home, err := user.GetUserGroupSupplementaryHome(u, syscall.Getuid(), syscall.Getgid(), "/") + if err != nil { + return fmt.Errorf("get supplementary groups %s", err) + } + + if err := syscall.Setgroups(suppGids); err != nil { + return fmt.Errorf("setgroups %s", err) + } + + if err := syscall.Setgid(gid); err != nil { + return fmt.Errorf("setgid %s", err) + } + + if err := syscall.Setuid(uid); err != nil { + return fmt.Errorf("setuid %s", err) + } + + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", home); err != nil { + return fmt.Errorf("set HOME %s", err) + } + } + + return nil +} + +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupNetwork(container *libcontainer.Config, networkState *network.NetworkState) error { + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + + err1 := strategy.Initialize((*network.Network)(config), networkState) + if err1 != nil { + return err1 + } + } + return nil +} + +func setupRoute(container *libcontainer.Config) error { + for _, config := range container.Routes { + if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil { + return err + } + } + return nil +} + +// FinalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaky file descriptors +// before execing the command inside the namespace +func FinalizeNamespace(container *libcontainer.Config) error { + // Ensure that all non-standard fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("close open file descriptors %s", err) + } + + // drop capabilities in bounding set before changing user + if err := capabilities.DropBoundingSet(container.Capabilities); err != nil { + return fmt.Errorf("drop bounding set %s", err) + } + + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return fmt.Errorf("set keep caps %s", err) + } + + if err := SetupUser(container.User); err != nil { + return fmt.Errorf("setup user %s", err) + } + + if err := system.ClearKeepCaps(); err != nil { + return fmt.Errorf("clear keep caps %s", err) + } + + // drop all other capabilities + if err := capabilities.DropCapabilities(container.Capabilities); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + + if container.WorkingDir != "" { + if err := syscall.Chdir(container.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) + } + } + + return nil +} + +func LoadContainerEnvironment(container *libcontainer.Config) error { + os.Clearenv() + for _, pair := range container.Env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} diff --git a/namespaces/nsenter/README.md b/namespaces/nsenter/README.md new file mode 100644 index 00000000..ac94cba0 --- /dev/null +++ b/namespaces/nsenter/README.md @@ -0,0 +1,6 @@ +## nsenter + +The `nsenter` package registers a special init constructor that is called before the Go runtime has +a chance to boot. This provides us the ability to `setns` on existing namespaces and avoid the issues +that the Go runtime has with multiple threads. This constructor is only called if this package is +registered, imported, in your go application and the argv 0 is `nsenter`. diff --git a/namespaces/nsenter/nsenter.c b/namespaces/nsenter/nsenter.c new file mode 100644 index 00000000..2869dd14 --- /dev/null +++ b/namespaces/nsenter/nsenter.c @@ -0,0 +1,218 @@ +// +build cgo +// +// formated with indent -linux nsenter.c + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const kBufSize = 256; +static const char *kNsEnter = "nsenter"; + +void get_args(int *argc, char ***argv) +{ + // Read argv + int fd = open("/proc/self/cmdline", O_RDONLY); + + // Read the whole commandline. + ssize_t contents_size = 0; + ssize_t contents_offset = 0; + char *contents = NULL; + ssize_t bytes_read = 0; + do { + contents_size += kBufSize; + contents = (char *)realloc(contents, contents_size); + bytes_read = + read(fd, contents + contents_offset, + contents_size - contents_offset); + contents_offset += bytes_read; + } + while (bytes_read > 0); + close(fd); + + // Parse the commandline into an argv. /proc/self/cmdline has \0 delimited args. + ssize_t i; + *argc = 0; + for (i = 0; i < contents_offset; i++) { + if (contents[i] == '\0') { + (*argc)++; + } + } + *argv = (char **)malloc(sizeof(char *) * ((*argc) + 1)); + int idx; + for (idx = 0; idx < (*argc); idx++) { + (*argv)[idx] = contents; + contents += strlen(contents) + 1; + } + (*argv)[*argc] = NULL; +} + +// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12) +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +#define _GNU_SOURCE +#include +#include "syscall.h" +#ifdef SYS_setns +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif +#endif + +void print_usage() +{ + fprintf(stderr, + "nsenter --nspid --console -- cmd1 arg1 arg2...\n"); +} + +void nsenter() +{ + int argc, c; + char **argv; + get_args(&argc, &argv); + + // check argv 0 to ensure that we are supposed to setns + // we use strncmp to test for a value of "nsenter" but also allows alternate implmentations + // after the setns code path to continue to use the argv 0 to determine actions to be run + // resulting in the ability to specify "nsenter-mknod", "nsenter-exec", etc... + if (strncmp(argv[0], kNsEnter, strlen(kNsEnter)) != 0) { + return; + } + + static const struct option longopts[] = { + {"nspid", required_argument, NULL, 'n'}, + {"console", required_argument, NULL, 't'}, + {NULL, 0, NULL, 0} + }; + + pid_t init_pid = -1; + char *init_pid_str = NULL; + char *console = NULL; + while ((c = getopt_long_only(argc, argv, "n:c:", longopts, NULL)) != -1) { + switch (c) { + case 'n': + init_pid_str = optarg; + break; + case 't': + console = optarg; + break; + } + } + + if (init_pid_str == NULL) { + print_usage(); + exit(1); + } + + init_pid = strtol(init_pid_str, NULL, 10); + if ((init_pid == 0 && errno == EINVAL) || errno == ERANGE) { + fprintf(stderr, + "nsenter: Failed to parse PID from \"%s\" with output \"%d\" and error: \"%s\"\n", + init_pid_str, init_pid, strerror(errno)); + print_usage(); + exit(1); + } + + argc -= 3; + argv += 3; + + if (setsid() == -1) { + fprintf(stderr, "setsid failed. Error: %s\n", strerror(errno)); + exit(1); + } + // before we setns we need to dup the console + int consolefd = -1; + if (console != NULL) { + consolefd = open(console, O_RDWR); + if (consolefd < 0) { + fprintf(stderr, + "nsenter: failed to open console %s %s\n", + console, strerror(errno)); + exit(1); + } + } + // Setns on all supported namespaces. + char ns_dir[PATH_MAX]; + memset(ns_dir, 0, PATH_MAX); + snprintf(ns_dir, PATH_MAX - 1, "/proc/%d/ns/", init_pid); + + char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt" }; + const int num = sizeof(namespaces) / sizeof(char *); + int i; + for (i = 0; i < num; i++) { + char buf[PATH_MAX]; + memset(buf, 0, PATH_MAX); + snprintf(buf, PATH_MAX - 1, "%s%s", ns_dir, namespaces[i]); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + // Ignore nonexistent namespaces. + if (errno == ENOENT) + continue; + + fprintf(stderr, + "nsenter: Failed to open ns file \"%s\" for ns \"%s\" with error: \"%s\"\n", + buf, namespaces[i], strerror(errno)); + exit(1); + } + // Set the namespace. + if (setns(fd, 0) == -1) { + fprintf(stderr, + "nsenter: Failed to setns for \"%s\" with error: \"%s\"\n", + namespaces[i], strerror(errno)); + exit(1); + } + close(fd); + } + + // We must fork to actually enter the PID namespace. + int child = fork(); + if (child == 0) { + if (consolefd != -1) { + if (dup2(consolefd, STDIN_FILENO) != 0) { + fprintf(stderr, "nsenter: failed to dup 0 %s\n", + strerror(errno)); + exit(1); + } + if (dup2(consolefd, STDOUT_FILENO) != STDOUT_FILENO) { + fprintf(stderr, "nsenter: failed to dup 1 %s\n", + strerror(errno)); + exit(1); + } + if (dup2(consolefd, STDERR_FILENO) != STDERR_FILENO) { + fprintf(stderr, "nsenter: failed to dup 2 %s\n", + strerror(errno)); + exit(1); + } + } + // Finish executing, let the Go runtime take over. + return; + } else { + // Parent, wait for the child. + int status = 0; + if (waitpid(child, &status, 0) == -1) { + fprintf(stderr, + "nsenter: Failed to waitpid with error: \"%s\"\n", + strerror(errno)); + exit(1); + } + // Forward the child's exit code or re-send its death signal. + if (WIFEXITED(status)) { + exit(WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + kill(getpid(), WTERMSIG(status)); + } + + exit(1); + } + + return; +} diff --git a/namespaces/nsenter/nsenter.go b/namespaces/nsenter/nsenter.go new file mode 100644 index 00000000..7d21e8e5 --- /dev/null +++ b/namespaces/nsenter/nsenter.go @@ -0,0 +1,10 @@ +// +build linux + +package nsenter + +/* +__attribute__((constructor)) init() { + nsenter(); +} +*/ +import "C" diff --git a/namespaces/nsenter/nsenter_unsupported.go b/namespaces/nsenter/nsenter_unsupported.go new file mode 100644 index 00000000..2459c636 --- /dev/null +++ b/namespaces/nsenter/nsenter_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux !cgo + +package nsenter diff --git a/namespaces/types.go b/namespaces/types.go new file mode 100644 index 00000000..16ce981e --- /dev/null +++ b/namespaces/types.go @@ -0,0 +1,50 @@ +package namespaces + +import "errors" + +type ( + Namespace struct { + Key string `json:"key,omitempty"` + Value int `json:"value,omitempty"` + File string `json:"file,omitempty"` + } + Namespaces []*Namespace +) + +// namespaceList is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace +var ( + namespaceList = Namespaces{} + ErrUnkownNamespace = errors.New("Unknown namespace") + ErrUnsupported = errors.New("Unsupported method") +) + +func (ns *Namespace) String() string { + return ns.Key +} + +func GetNamespace(key string) *Namespace { + for _, ns := range namespaceList { + if ns.Key == key { + cpy := *ns + return &cpy + } + } + return nil +} + +// Contains returns true if the specified Namespace is +// in the slice +func (n Namespaces) Contains(ns string) bool { + return n.Get(ns) != nil +} + +func (n Namespaces) Get(ns string) *Namespace { + for _, nsp := range n { + if nsp != nil && nsp.Key == ns { + return nsp + } + } + return nil +} diff --git a/namespaces/types_linux.go b/namespaces/types_linux.go new file mode 100644 index 00000000..d3079944 --- /dev/null +++ b/namespaces/types_linux.go @@ -0,0 +1,16 @@ +package namespaces + +import ( + "syscall" +) + +func init() { + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } +} diff --git a/namespaces/types_test.go b/namespaces/types_test.go new file mode 100644 index 00000000..4d0a72c9 --- /dev/null +++ b/namespaces/types_test.go @@ -0,0 +1,30 @@ +package namespaces + +import ( + "testing" +) + +func TestNamespacesContains(t *testing.T) { + ns := Namespaces{ + GetNamespace("NEWPID"), + GetNamespace("NEWNS"), + GetNamespace("NEWUTS"), + } + + if ns.Contains("NEWNET") { + t.Fatal("namespaces should not contain NEWNET") + } + + if !ns.Contains("NEWPID") { + t.Fatal("namespaces should contain NEWPID but does not") + } + + withNil := Namespaces{ + GetNamespace("UNDEFINED"), // this element will be nil + GetNamespace("NEWPID"), + } + + if !withNil.Contains("NEWPID") { + t.Fatal("namespaces should contain NEWPID but does not") + } +} diff --git a/netlink/MAINTAINERS b/netlink/MAINTAINERS new file mode 100644 index 00000000..1cb55136 --- /dev/null +++ b/netlink/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume J. Charmes (@creack) diff --git a/netlink/netlink.go b/netlink/netlink.go new file mode 100644 index 00000000..90883660 --- /dev/null +++ b/netlink/netlink.go @@ -0,0 +1,31 @@ +// Packet netlink provide access to low level Netlink sockets and messages. +// +// Actual implementations are in: +// netlink_linux.go +// netlink_darwin.go +package netlink + +import ( + "errors" + "net" +) + +var ( + ErrWrongSockType = errors.New("Wrong socket type") + ErrShortResponse = errors.New("Got short response from netlink") + ErrInterfaceExists = errors.New("Network interface already exists") +) + +// A Route is a subnet associated with the interface to reach it. +type Route struct { + *net.IPNet + Iface *net.Interface + Default bool +} + +// An IfAddr defines IP network settings for a given network interface +type IfAddr struct { + Iface *net.Interface + IP net.IP + IPNet *net.IPNet +} diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go new file mode 100644 index 00000000..3083cf90 --- /dev/null +++ b/netlink/netlink_linux.go @@ -0,0 +1,1219 @@ +package netlink + +import ( + "encoding/binary" + "fmt" + "io" + "math/rand" + "net" + "os" + "sync/atomic" + "syscall" + "unsafe" +) + +const ( + IFNAMSIZ = 16 + DEFAULT_CHANGE = 0xFFFFFFFF + IFLA_INFO_KIND = 1 + IFLA_INFO_DATA = 2 + VETH_INFO_PEER = 1 + IFLA_MACVLAN_MODE = 1 + IFLA_VLAN_ID = 1 + IFLA_NET_NS_FD = 28 + IFLA_ADDRESS = 1 + SIOC_BRADDBR = 0x89a0 + SIOC_BRDELBR = 0x89a1 + SIOC_BRADDIF = 0x89a2 +) + +const ( + MACVLAN_MODE_PRIVATE = 1 << iota + MACVLAN_MODE_VEPA + MACVLAN_MODE_BRIDGE + MACVLAN_MODE_PASSTHRU +) + +var nextSeqNr uint32 + +type ifreqHwaddr struct { + IfrnName [IFNAMSIZ]byte + IfruHwaddr syscall.RawSockaddr +} + +type ifreqIndex struct { + IfrnName [IFNAMSIZ]byte + IfruIndex int32 +} + +type ifreqFlags struct { + IfrnName [IFNAMSIZ]byte + Ifruflags uint16 +} + +var native binary.ByteOrder + +func init() { + var x uint32 = 0x01020304 + if *(*byte)(unsafe.Pointer(&x)) == 0x01 { + native = binary.BigEndian + } else { + native = binary.LittleEndian + } +} + +func getIpFamily(ip net.IP) int { + if len(ip) <= net.IPv4len { + return syscall.AF_INET + } + if ip.To4() != nil { + return syscall.AF_INET + } + return syscall.AF_INET6 +} + +type NetlinkRequestData interface { + Len() int + ToWireFormat() []byte +} + +type IfInfomsg struct { + syscall.IfInfomsg +} + +func newIfInfomsg(family int) *IfInfomsg { + return &IfInfomsg{ + IfInfomsg: syscall.IfInfomsg{ + Family: uint8(family), + }, + } +} + +func newIfInfomsgChild(parent *RtAttr, family int) *IfInfomsg { + msg := newIfInfomsg(family) + parent.children = append(parent.children, msg) + return msg +} + +func (msg *IfInfomsg) ToWireFormat() []byte { + length := syscall.SizeofIfInfomsg + b := make([]byte, length) + b[0] = msg.Family + b[1] = 0 + native.PutUint16(b[2:4], msg.Type) + native.PutUint32(b[4:8], uint32(msg.Index)) + native.PutUint32(b[8:12], msg.Flags) + native.PutUint32(b[12:16], msg.Change) + return b +} + +func (msg *IfInfomsg) Len() int { + return syscall.SizeofIfInfomsg +} + +type IfAddrmsg struct { + syscall.IfAddrmsg +} + +func newIfAddrmsg(family int) *IfAddrmsg { + return &IfAddrmsg{ + IfAddrmsg: syscall.IfAddrmsg{ + Family: uint8(family), + }, + } +} + +func (msg *IfAddrmsg) ToWireFormat() []byte { + length := syscall.SizeofIfAddrmsg + b := make([]byte, length) + b[0] = msg.Family + b[1] = msg.Prefixlen + b[2] = msg.Flags + b[3] = msg.Scope + native.PutUint32(b[4:8], msg.Index) + return b +} + +func (msg *IfAddrmsg) Len() int { + return syscall.SizeofIfAddrmsg +} + +type RtMsg struct { + syscall.RtMsg +} + +func newRtMsg() *RtMsg { + return &RtMsg{ + RtMsg: syscall.RtMsg{ + Table: syscall.RT_TABLE_MAIN, + Scope: syscall.RT_SCOPE_UNIVERSE, + Protocol: syscall.RTPROT_BOOT, + Type: syscall.RTN_UNICAST, + }, + } +} + +func (msg *RtMsg) ToWireFormat() []byte { + length := syscall.SizeofRtMsg + b := make([]byte, length) + b[0] = msg.Family + b[1] = msg.Dst_len + b[2] = msg.Src_len + b[3] = msg.Tos + b[4] = msg.Table + b[5] = msg.Protocol + b[6] = msg.Scope + b[7] = msg.Type + native.PutUint32(b[8:12], msg.Flags) + return b +} + +func (msg *RtMsg) Len() int { + return syscall.SizeofRtMsg +} + +func rtaAlignOf(attrlen int) int { + return (attrlen + syscall.RTA_ALIGNTO - 1) & ^(syscall.RTA_ALIGNTO - 1) +} + +type RtAttr struct { + syscall.RtAttr + Data []byte + children []NetlinkRequestData +} + +func newRtAttr(attrType int, data []byte) *RtAttr { + return &RtAttr{ + RtAttr: syscall.RtAttr{ + Type: uint16(attrType), + }, + children: []NetlinkRequestData{}, + Data: data, + } +} + +func newRtAttrChild(parent *RtAttr, attrType int, data []byte) *RtAttr { + attr := newRtAttr(attrType, data) + parent.children = append(parent.children, attr) + return attr +} + +func (a *RtAttr) Len() int { + if len(a.children) == 0 { + return (syscall.SizeofRtAttr + len(a.Data)) + } + + l := 0 + for _, child := range a.children { + l += child.Len() + } + l += syscall.SizeofRtAttr + return rtaAlignOf(l + len(a.Data)) +} + +func (a *RtAttr) ToWireFormat() []byte { + length := a.Len() + buf := make([]byte, rtaAlignOf(length)) + + if a.Data != nil { + copy(buf[4:], a.Data) + } else { + next := 4 + for _, child := range a.children { + childBuf := child.ToWireFormat() + copy(buf[next:], childBuf) + next += rtaAlignOf(len(childBuf)) + } + } + + if l := uint16(length); l != 0 { + native.PutUint16(buf[0:2], l) + } + native.PutUint16(buf[2:4], a.Type) + return buf +} + +func uint32Attr(t int, n uint32) *RtAttr { + buf := make([]byte, 4) + native.PutUint32(buf, n) + return newRtAttr(t, buf) +} + +type NetlinkRequest struct { + syscall.NlMsghdr + Data []NetlinkRequestData +} + +func (rr *NetlinkRequest) ToWireFormat() []byte { + length := rr.Len + dataBytes := make([][]byte, len(rr.Data)) + for i, data := range rr.Data { + dataBytes[i] = data.ToWireFormat() + length += uint32(len(dataBytes[i])) + } + b := make([]byte, length) + native.PutUint32(b[0:4], length) + native.PutUint16(b[4:6], rr.Type) + native.PutUint16(b[6:8], rr.Flags) + native.PutUint32(b[8:12], rr.Seq) + native.PutUint32(b[12:16], rr.Pid) + + next := 16 + for _, data := range dataBytes { + copy(b[next:], data) + next += len(data) + } + return b +} + +func (rr *NetlinkRequest) AddData(data NetlinkRequestData) { + if data != nil { + rr.Data = append(rr.Data, data) + } +} + +func newNetlinkRequest(proto, flags int) *NetlinkRequest { + return &NetlinkRequest{ + NlMsghdr: syscall.NlMsghdr{ + Len: uint32(syscall.NLMSG_HDRLEN), + Type: uint16(proto), + Flags: syscall.NLM_F_REQUEST | uint16(flags), + Seq: atomic.AddUint32(&nextSeqNr, 1), + }, + } +} + +type NetlinkSocket struct { + fd int + lsa syscall.SockaddrNetlink +} + +func getNetlinkSocket() (*NetlinkSocket, error) { + fd, err := syscall.Socket(syscall.AF_NETLINK, syscall.SOCK_RAW, syscall.NETLINK_ROUTE) + if err != nil { + return nil, err + } + s := &NetlinkSocket{ + fd: fd, + } + s.lsa.Family = syscall.AF_NETLINK + if err := syscall.Bind(fd, &s.lsa); err != nil { + syscall.Close(fd) + return nil, err + } + + return s, nil +} + +func (s *NetlinkSocket) Close() { + syscall.Close(s.fd) +} + +func (s *NetlinkSocket) Send(request *NetlinkRequest) error { + if err := syscall.Sendto(s.fd, request.ToWireFormat(), 0, &s.lsa); err != nil { + return err + } + return nil +} + +func (s *NetlinkSocket) Receive() ([]syscall.NetlinkMessage, error) { + rb := make([]byte, syscall.Getpagesize()) + nr, _, err := syscall.Recvfrom(s.fd, rb, 0) + if err != nil { + return nil, err + } + if nr < syscall.NLMSG_HDRLEN { + return nil, ErrShortResponse + } + rb = rb[:nr] + return syscall.ParseNetlinkMessage(rb) +} + +func (s *NetlinkSocket) GetPid() (uint32, error) { + lsa, err := syscall.Getsockname(s.fd) + if err != nil { + return 0, err + } + switch v := lsa.(type) { + case *syscall.SockaddrNetlink: + return v.Pid, nil + } + return 0, ErrWrongSockType +} + +func (s *NetlinkSocket) CheckMessage(m syscall.NetlinkMessage, seq, pid uint32) error { + if m.Header.Seq != seq { + return fmt.Errorf("netlink: invalid seq %d, expected %d", m.Header.Seq, seq) + } + if m.Header.Pid != pid { + return fmt.Errorf("netlink: wrong pid %d, expected %d", m.Header.Pid, pid) + } + if m.Header.Type == syscall.NLMSG_DONE { + return io.EOF + } + if m.Header.Type == syscall.NLMSG_ERROR { + e := int32(native.Uint32(m.Data[0:4])) + if e == 0 { + return io.EOF + } + return syscall.Errno(-e) + } + return nil +} + +func (s *NetlinkSocket) HandleAck(seq uint32) error { + pid, err := s.GetPid() + if err != nil { + return err + } + +outer: + for { + msgs, err := s.Receive() + if err != nil { + return err + } + for _, m := range msgs { + if err := s.CheckMessage(m, seq, pid); err != nil { + if err == io.EOF { + break outer + } + return err + } + } + } + + return nil +} + +func zeroTerminated(s string) []byte { + return []byte(s + "\000") +} + +func nonZeroTerminated(s string) []byte { + return []byte(s) +} + +// Add a new network link of a specified type. +// This is identical to running: ip link add $name type $linkType +func NetworkLinkAdd(name string, linkType string) error { + if name == "" || linkType == "" { + return fmt.Errorf("Neither link name nor link type can be empty!") + } + + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + linkInfo := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(linkInfo, IFLA_INFO_KIND, nonZeroTerminated(linkType)) + wb.AddData(linkInfo) + + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name)) + wb.AddData(nameData) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Delete a network link. +// This is identical to running: ip link del $name +func NetworkLinkDel(name string) error { + if name == "" { + return fmt.Errorf("Network link name can not be empty!") + } + + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + + wb := newNetlinkRequest(syscall.RTM_DELLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Bring up a particular network interface. +// This is identical to running: ip link set dev $name up +func NetworkLinkUp(iface *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + msg.Flags = syscall.IFF_UP + msg.Change = syscall.IFF_UP + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Bring down a particular network interface. +// This is identical to running: ip link set $name down +func NetworkLinkDown(iface *net.Interface) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + msg.Flags = 0 & ^syscall.IFF_UP + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Set link layer address ie. MAC Address. +// This is identical to running: ip link set dev $name address $macaddress +func NetworkSetMacAddress(iface *net.Interface, macaddr string) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + hwaddr, err := net.ParseMAC(macaddr) + if err != nil { + return err + } + + var ( + MULTICAST byte = 0x1 + LOCALOUI byte = 0x2 + ) + + if hwaddr[0]&0x1 == MULTICAST || hwaddr[0]&0x2 != LOCALOUI { + return fmt.Errorf("Incorrect Local MAC Address specified: %s", macaddr) + } + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + macdata := make([]byte, 6) + copy(macdata, hwaddr) + data := newRtAttr(IFLA_ADDRESS, macdata) + wb.AddData(data) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} + +// Set link Maximum Transmission Unit +// This is identical to running: ip link set dev $name mtu $MTU +// bridge is a bitch here https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=292088 +// https://bugzilla.redhat.com/show_bug.cgi?id=697021 +// There is a discussion about how to deal with ifcs joining bridge with MTU > 1500 +// Regular network nterfaces do seem to work though! +func NetworkSetMTU(iface *net.Interface, mtu int) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + wb.AddData(uint32Attr(syscall.IFLA_MTU, uint32(mtu))) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} + +func networkMasterAction(iface *net.Interface, rtattr *RtAttr) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Type = syscall.RTM_SETLINK + msg.Flags = syscall.NLM_F_REQUEST + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + wb.AddData(rtattr) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Add an interface to bridge. +// This is identical to running: ip link set $name master $master +func NetworkSetMaster(iface, master *net.Interface) error { + data := uint32Attr(syscall.IFLA_MASTER, uint32(master.Index)) + return networkMasterAction(iface, data) +} + +// Remove an interface from the bridge +// This is is identical to to running: ip link $name set nomaster +func NetworkSetNoMaster(iface *net.Interface) error { + data := uint32Attr(syscall.IFLA_MASTER, 0) + return networkMasterAction(iface, data) +} + +func networkSetNsAction(iface *net.Interface, rtattr *RtAttr) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_ACK) + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + wb.AddData(msg) + wb.AddData(rtattr) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Move a particular network interface to a particular network namespace +// specified by PID. This is idential to running: ip link set dev $name netns $pid +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + data := uint32Attr(syscall.IFLA_NET_NS_PID, uint32(nspid)) + return networkSetNsAction(iface, data) +} + +// Move a particular network interface to a particular mounted +// network namespace specified by file descriptor. +// This is idential to running: ip link set dev $name netns $fd +func NetworkSetNsFd(iface *net.Interface, fd int) error { + data := uint32Attr(IFLA_NET_NS_FD, uint32(fd)) + return networkSetNsAction(iface, data) +} + +// Rname a particular interface to a different name +// !!! Note that you can't rename an active interface. You need to bring it down before renaming it. +// This is identical to running: ip link set dev ${oldName} name ${newName} +func NetworkChangeName(iface *net.Interface, newName string) error { + if len(newName) >= IFNAMSIZ { + return fmt.Errorf("Interface name %s too long", newName) + } + + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + msg.Index = int32(iface.Index) + msg.Change = DEFAULT_CHANGE + wb.AddData(msg) + + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(newName)) + wb.AddData(nameData) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Add a new VETH pair link on the host +// This is identical to running: ip link add name $name type veth peer name $peername +func NetworkCreateVethPair(name1, name2 string) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + nameData := newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(name1)) + wb.AddData(nameData) + + nest1 := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(nest1, IFLA_INFO_KIND, zeroTerminated("veth")) + nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) + nest3 := newRtAttrChild(nest2, VETH_INFO_PEER, nil) + + newIfInfomsgChild(nest3, syscall.AF_UNSPEC) + newRtAttrChild(nest3, syscall.IFLA_IFNAME, zeroTerminated(name2)) + + wb.AddData(nest1) + + if err := s.Send(wb); err != nil { + return err + } + + if err := s.HandleAck(wb.Seq); err != nil { + if os.IsExist(err) { + return ErrInterfaceExists + } + + return err + } + + return nil +} + +// Add a new VLAN interface with masterDev as its upper device +// This is identical to running: +// ip link add name $name link $masterdev type vlan id $id +func NetworkLinkAddVlan(masterDev, vlanDev string, vlanId uint16) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + masterDevIfc, err := net.InterfaceByName(masterDev) + if err != nil { + return err + } + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + nest1 := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(nest1, IFLA_INFO_KIND, nonZeroTerminated("vlan")) + + nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) + vlanData := make([]byte, 2) + native.PutUint16(vlanData, vlanId) + newRtAttrChild(nest2, IFLA_VLAN_ID, vlanData) + wb.AddData(nest1) + + wb.AddData(uint32Attr(syscall.IFLA_LINK, uint32(masterDevIfc.Index))) + wb.AddData(newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(vlanDev))) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} + +// Add MAC VLAN network interface with masterDev as its upper device +// This is identical to running: +// ip link add name $name link $masterdev type macvlan mode $mode +func NetworkLinkAddMacVlan(masterDev, macVlanDev string, mode string) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + macVlan := map[string]uint32{ + "private": MACVLAN_MODE_PRIVATE, + "vepa": MACVLAN_MODE_VEPA, + "bridge": MACVLAN_MODE_BRIDGE, + "passthru": MACVLAN_MODE_PASSTHRU, + } + + wb := newNetlinkRequest(syscall.RTM_NEWLINK, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + + masterDevIfc, err := net.InterfaceByName(masterDev) + if err != nil { + return err + } + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + nest1 := newRtAttr(syscall.IFLA_LINKINFO, nil) + newRtAttrChild(nest1, IFLA_INFO_KIND, nonZeroTerminated("macvlan")) + + nest2 := newRtAttrChild(nest1, IFLA_INFO_DATA, nil) + macVlanData := make([]byte, 4) + native.PutUint32(macVlanData, macVlan[mode]) + newRtAttrChild(nest2, IFLA_MACVLAN_MODE, macVlanData) + wb.AddData(nest1) + + wb.AddData(uint32Attr(syscall.IFLA_LINK, uint32(masterDevIfc.Index))) + wb.AddData(newRtAttr(syscall.IFLA_IFNAME, zeroTerminated(macVlanDev))) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} + +func networkLinkIpAction(action, flags int, ifa IfAddr) error { + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + family := getIpFamily(ifa.IP) + + wb := newNetlinkRequest(action, flags) + + msg := newIfAddrmsg(family) + msg.Index = uint32(ifa.Iface.Index) + prefixLen, _ := ifa.IPNet.Mask.Size() + msg.Prefixlen = uint8(prefixLen) + wb.AddData(msg) + + var ipData []byte + if family == syscall.AF_INET { + ipData = ifa.IP.To4() + } else { + ipData = ifa.IP.To16() + } + + localData := newRtAttr(syscall.IFA_LOCAL, ipData) + wb.AddData(localData) + + addrData := newRtAttr(syscall.IFA_ADDRESS, ipData) + wb.AddData(addrData) + + if err := s.Send(wb); err != nil { + return err + } + + return s.HandleAck(wb.Seq) +} + +// Delete an IP address from an interface. This is identical to: +// ip addr del $ip/$ipNet dev $iface +func NetworkLinkDelIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { + return networkLinkIpAction( + syscall.RTM_DELADDR, + syscall.NLM_F_ACK, + IfAddr{iface, ip, ipNet}, + ) +} + +// Add an Ip address to an interface. This is identical to: +// ip addr add $ip/$ipNet dev $iface +func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { + return networkLinkIpAction( + syscall.RTM_NEWADDR, + syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK, + IfAddr{iface, ip, ipNet}, + ) +} + +// Returns an array of IPNet for all the currently routed subnets on ipv4 +// This is similar to the first column of "ip route" output +func NetworkGetRoutes() ([]Route, error) { + s, err := getNetlinkSocket() + if err != nil { + return nil, err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_GETROUTE, syscall.NLM_F_DUMP) + + msg := newIfInfomsg(syscall.AF_UNSPEC) + wb.AddData(msg) + + if err := s.Send(wb); err != nil { + return nil, err + } + + pid, err := s.GetPid() + if err != nil { + return nil, err + } + + res := make([]Route, 0) + +outer: + for { + msgs, err := s.Receive() + if err != nil { + return nil, err + } + for _, m := range msgs { + if err := s.CheckMessage(m, wb.Seq, pid); err != nil { + if err == io.EOF { + break outer + } + return nil, err + } + if m.Header.Type != syscall.RTM_NEWROUTE { + continue + } + + var r Route + + msg := (*RtMsg)(unsafe.Pointer(&m.Data[0:syscall.SizeofRtMsg][0])) + + if msg.Flags&syscall.RTM_F_CLONED != 0 { + // Ignore cloned routes + continue + } + + if msg.Table != syscall.RT_TABLE_MAIN { + // Ignore non-main tables + continue + } + + if msg.Family != syscall.AF_INET { + // Ignore non-ipv4 routes + continue + } + + if msg.Dst_len == 0 { + // Default routes + r.Default = true + } + + attrs, err := syscall.ParseNetlinkRouteAttr(&m) + if err != nil { + return nil, err + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.RTA_DST: + ip := attr.Value + r.IPNet = &net.IPNet{ + IP: ip, + Mask: net.CIDRMask(int(msg.Dst_len), 8*len(ip)), + } + case syscall.RTA_OIF: + index := int(native.Uint32(attr.Value[0:4])) + r.Iface, _ = net.InterfaceByIndex(index) + } + } + if r.Default || r.IPNet != nil { + res = append(res, r) + } + } + } + + return res, nil +} + +// Add a new route table entry. +func AddRoute(destination, source, gateway, device string) error { + if destination == "" && source == "" && gateway == "" { + return fmt.Errorf("one of destination, source or gateway must not be blank") + } + + s, err := getNetlinkSocket() + if err != nil { + return err + } + defer s.Close() + + wb := newNetlinkRequest(syscall.RTM_NEWROUTE, syscall.NLM_F_CREATE|syscall.NLM_F_EXCL|syscall.NLM_F_ACK) + msg := newRtMsg() + currentFamily := -1 + var rtAttrs []*RtAttr + + if destination != "" { + destIP, destNet, err := net.ParseCIDR(destination) + if err != nil { + return fmt.Errorf("destination CIDR %s couldn't be parsed", destination) + } + destFamily := getIpFamily(destIP) + currentFamily = destFamily + destLen, bits := destNet.Mask.Size() + if destLen == 0 && bits == 0 { + return fmt.Errorf("destination CIDR %s generated a non-canonical Mask", destination) + } + msg.Family = uint8(destFamily) + msg.Dst_len = uint8(destLen) + var destData []byte + if destFamily == syscall.AF_INET { + destData = destIP.To4() + } else { + destData = destIP.To16() + } + rtAttrs = append(rtAttrs, newRtAttr(syscall.RTA_DST, destData)) + } + + if source != "" { + srcIP, srcNet, err := net.ParseCIDR(source) + if err != nil { + return fmt.Errorf("source CIDR %s couldn't be parsed", source) + } + srcFamily := getIpFamily(srcIP) + if currentFamily != -1 && currentFamily != srcFamily { + return fmt.Errorf("source and destination ip were not the same IP family") + } + currentFamily = srcFamily + srcLen, bits := srcNet.Mask.Size() + if srcLen == 0 && bits == 0 { + return fmt.Errorf("source CIDR %s generated a non-canonical Mask", source) + } + msg.Family = uint8(srcFamily) + msg.Src_len = uint8(srcLen) + var srcData []byte + if srcFamily == syscall.AF_INET { + srcData = srcIP.To4() + } else { + srcData = srcIP.To16() + } + rtAttrs = append(rtAttrs, newRtAttr(syscall.RTA_SRC, srcData)) + } + + if gateway != "" { + gwIP := net.ParseIP(gateway) + if gwIP == nil { + return fmt.Errorf("gateway IP %s couldn't be parsed", gateway) + } + gwFamily := getIpFamily(gwIP) + if currentFamily != -1 && currentFamily != gwFamily { + return fmt.Errorf("gateway, source, and destination ip were not the same IP family") + } + msg.Family = uint8(gwFamily) + var gwData []byte + if gwFamily == syscall.AF_INET { + gwData = gwIP.To4() + } else { + gwData = gwIP.To16() + } + rtAttrs = append(rtAttrs, newRtAttr(syscall.RTA_GATEWAY, gwData)) + } + + wb.AddData(msg) + for _, attr := range rtAttrs { + wb.AddData(attr) + } + + iface, err := net.InterfaceByName(device) + if err != nil { + return err + } + wb.AddData(uint32Attr(syscall.RTA_OIF, uint32(iface.Index))) + + if err := s.Send(wb); err != nil { + return err + } + return s.HandleAck(wb.Seq) +} + +// Add a new default gateway. Identical to: +// ip route add default via $ip +func AddDefaultGw(ip, device string) error { + return AddRoute("", "", ip, device) +} + +// THIS CODE DOES NOT COMMUNICATE WITH KERNEL VIA RTNETLINK INTERFACE +// IT IS HERE FOR BACKWARDS COMPATIBILITY WITH OLDER LINUX KERNELS +// WHICH SHIP WITH OLDER NOT ENTIRELY FUNCTIONAL VERSION OF NETLINK +func getIfSocket() (fd int, err error) { + for _, socket := range []int{ + syscall.AF_INET, + syscall.AF_PACKET, + syscall.AF_INET6, + } { + if fd, err = syscall.Socket(socket, syscall.SOCK_DGRAM, 0); err == nil { + break + } + } + if err == nil { + return fd, nil + } + return -1, err +} + +// Create the actual bridge device. This is more backward-compatible than +// netlink.NetworkLinkAdd and works on RHEL 6. +func CreateBridge(name string, setMacAddr bool) error { + if len(name) >= IFNAMSIZ { + return fmt.Errorf("Interface name %s too long", name) + } + + s, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(s) + + nameBytePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s), SIOC_BRADDBR, uintptr(unsafe.Pointer(nameBytePtr))); err != 0 { + return err + } + if setMacAddr { + return SetMacAddress(name, randMacAddr()) + } + return nil +} + +// Delete the actual bridge device. +func DeleteBridge(name string) error { + s, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(s) + + nameBytePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + + var ifr ifreqFlags + copy(ifr.IfrnName[:len(ifr.IfrnName)-1], []byte(name)) + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s), + syscall.SIOCSIFFLAGS, uintptr(unsafe.Pointer(&ifr))); err != 0 { + return err + } + + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s), + SIOC_BRDELBR, uintptr(unsafe.Pointer(nameBytePtr))); err != 0 { + return err + } + return nil +} + +// Add a slave to abridge device. This is more backward-compatible than +// netlink.NetworkSetMaster and works on RHEL 6. +func AddToBridge(iface, master *net.Interface) error { + if len(master.Name) >= IFNAMSIZ { + return fmt.Errorf("Interface name %s too long", master.Name) + } + + s, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(s) + + ifr := ifreqIndex{} + copy(ifr.IfrnName[:len(ifr.IfrnName)-1], master.Name) + ifr.IfruIndex = int32(iface.Index) + + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s), SIOC_BRADDIF, uintptr(unsafe.Pointer(&ifr))); err != 0 { + return err + } + + return nil +} + +func randMacAddr() string { + hw := make(net.HardwareAddr, 6) + for i := 0; i < 6; i++ { + hw[i] = byte(rand.Intn(255)) + } + hw[0] &^= 0x1 // clear multicast bit + hw[0] |= 0x2 // set local assignment bit (IEEE802) + return hw.String() +} + +func SetMacAddress(name, addr string) error { + if len(name) >= IFNAMSIZ { + return fmt.Errorf("Interface name %s too long", name) + } + + hw, err := net.ParseMAC(addr) + if err != nil { + return err + } + + s, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(s) + + ifr := ifreqHwaddr{} + ifr.IfruHwaddr.Family = syscall.ARPHRD_ETHER + copy(ifr.IfrnName[:len(ifr.IfrnName)-1], name) + + for i := 0; i < 6; i++ { + ifr.IfruHwaddr.Data[i] = ifrDataByte(hw[i]) + } + + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s), syscall.SIOCSIFHWADDR, uintptr(unsafe.Pointer(&ifr))); err != 0 { + return err + } + return nil +} + +func ChangeName(iface *net.Interface, newName string) error { + if len(newName) >= IFNAMSIZ { + return fmt.Errorf("Interface name %s too long", newName) + } + + fd, err := getIfSocket() + if err != nil { + return err + } + defer syscall.Close(fd) + + data := [IFNAMSIZ * 2]byte{} + // the "-1"s here are very important for ensuring we get proper null + // termination of our new C strings + copy(data[:IFNAMSIZ-1], iface.Name) + copy(data[IFNAMSIZ:IFNAMSIZ*2-1], newName) + + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCSIFNAME, uintptr(unsafe.Pointer(&data[0]))); errno != 0 { + return errno + } + return nil +} diff --git a/netlink/netlink_linux_arm.go b/netlink/netlink_linux_arm.go new file mode 100644 index 00000000..779e58a7 --- /dev/null +++ b/netlink/netlink_linux_arm.go @@ -0,0 +1,5 @@ +package netlink + +func ifrDataByte(b byte) uint8 { + return uint8(b) +} diff --git a/netlink/netlink_linux_notarm.go b/netlink/netlink_linux_notarm.go new file mode 100644 index 00000000..f151722a --- /dev/null +++ b/netlink/netlink_linux_notarm.go @@ -0,0 +1,7 @@ +// +build !arm + +package netlink + +func ifrDataByte(b byte) int8 { + return int8(b) +} diff --git a/netlink/netlink_linux_test.go b/netlink/netlink_linux_test.go new file mode 100644 index 00000000..88c2e04a --- /dev/null +++ b/netlink/netlink_linux_test.go @@ -0,0 +1,356 @@ +package netlink + +import ( + "net" + "strings" + "syscall" + "testing" +) + +type testLink struct { + name string + linkType string +} + +func addLink(t *testing.T, name string, linkType string) { + if err := NetworkLinkAdd(name, linkType); err != nil { + t.Fatalf("Unable to create %s link: %s", name, err) + } +} + +func readLink(t *testing.T, name string) *net.Interface { + iface, err := net.InterfaceByName(name) + if err != nil { + t.Fatalf("Could not find %s interface: %s", name, err) + } + + return iface +} + +func deleteLink(t *testing.T, name string) { + if err := NetworkLinkDel(name); err != nil { + t.Fatalf("Unable to delete %s link: %s", name, err) + } +} + +func upLink(t *testing.T, name string) { + iface := readLink(t, name) + if err := NetworkLinkUp(iface); err != nil { + t.Fatalf("Could not bring UP %#v interface: %s", iface, err) + } +} + +func downLink(t *testing.T, name string) { + iface := readLink(t, name) + if err := NetworkLinkDown(iface); err != nil { + t.Fatalf("Could not bring DOWN %#v interface: %s", iface, err) + } +} + +func ipAssigned(iface *net.Interface, ip net.IP) bool { + addrs, _ := iface.Addrs() + + for _, addr := range addrs { + args := strings.SplitN(addr.String(), "/", 2) + if args[0] == ip.String() { + return true + } + } + + return false +} + +func TestNetworkLinkAddDel(t *testing.T) { + if testing.Short() { + return + } + + testLinks := []testLink{ + {"tstEth", "dummy"}, + {"tstBr", "bridge"}, + } + + for _, tl := range testLinks { + addLink(t, tl.name, tl.linkType) + defer deleteLink(t, tl.name) + readLink(t, tl.name) + } +} + +func TestNetworkLinkUpDown(t *testing.T) { + if testing.Short() { + return + } + + tl := testLink{name: "tstEth", linkType: "dummy"} + + addLink(t, tl.name, tl.linkType) + defer deleteLink(t, tl.name) + + upLink(t, tl.name) + ifcAfterUp := readLink(t, tl.name) + + if (ifcAfterUp.Flags & syscall.IFF_UP) != syscall.IFF_UP { + t.Fatalf("Could not bring UP %#v initerface", tl) + } + + downLink(t, tl.name) + ifcAfterDown := readLink(t, tl.name) + + if (ifcAfterDown.Flags & syscall.IFF_UP) == syscall.IFF_UP { + t.Fatalf("Could not bring DOWN %#v initerface", tl) + } +} + +func TestNetworkSetMacAddress(t *testing.T) { + if testing.Short() { + return + } + + tl := testLink{name: "tstEth", linkType: "dummy"} + macaddr := "22:ce:e0:99:63:6f" + + addLink(t, tl.name, tl.linkType) + defer deleteLink(t, tl.name) + + ifcBeforeSet := readLink(t, tl.name) + + if err := NetworkSetMacAddress(ifcBeforeSet, macaddr); err != nil { + t.Fatalf("Could not set %s MAC address on %#v interface: err", macaddr, tl, err) + } + + ifcAfterSet := readLink(t, tl.name) + + if ifcAfterSet.HardwareAddr.String() != macaddr { + t.Fatalf("Could not set %s MAC address on %#v interface", macaddr, tl) + } +} + +func TestNetworkSetMTU(t *testing.T) { + if testing.Short() { + return + } + + tl := testLink{name: "tstEth", linkType: "dummy"} + mtu := 1400 + + addLink(t, tl.name, tl.linkType) + defer deleteLink(t, tl.name) + + ifcBeforeSet := readLink(t, tl.name) + + if err := NetworkSetMTU(ifcBeforeSet, mtu); err != nil { + t.Fatalf("Could not set %d MTU on %#v interface: err", mtu, tl, err) + } + + ifcAfterSet := readLink(t, tl.name) + + if ifcAfterSet.MTU != mtu { + t.Fatalf("Could not set %d MTU on %#v interface", mtu, tl) + } +} + +func TestNetworkSetMasterNoMaster(t *testing.T) { + if testing.Short() { + return + } + + master := testLink{"tstBr", "bridge"} + slave := testLink{"tstEth", "dummy"} + testLinks := []testLink{master, slave} + + for _, tl := range testLinks { + addLink(t, tl.name, tl.linkType) + defer deleteLink(t, tl.name) + upLink(t, tl.name) + } + + masterIfc := readLink(t, master.name) + slaveIfc := readLink(t, slave.name) + if err := NetworkSetMaster(slaveIfc, masterIfc); err != nil { + t.Fatalf("Could not set %#v to be the master of %#v: %s", master, slave, err) + } + + // Trying to figure out a way to test which will not break on RHEL6. + // We could check for existence of /sys/class/net/tstEth/upper_tstBr + // which should point to the ../tstBr which is the UPPER device i.e. network bridge + + if err := NetworkSetNoMaster(slaveIfc); err != nil { + t.Fatalf("Could not UNset %#v master of %#v: %s", master, slave, err) + } +} + +func TestNetworkChangeName(t *testing.T) { + if testing.Short() { + return + } + + tl := testLink{"tstEth", "dummy"} + newName := "newTst" + + addLink(t, tl.name, tl.linkType) + + linkIfc := readLink(t, tl.name) + if err := NetworkChangeName(linkIfc, newName); err != nil { + deleteLink(t, tl.name) + t.Fatalf("Could not change %#v interface name to %s: %s", tl, newName, err) + } + + readLink(t, newName) + deleteLink(t, newName) +} + +func TestNetworkLinkAddVlan(t *testing.T) { + if testing.Short() { + return + } + + tl := struct { + name string + id uint16 + }{ + name: "tstVlan", + id: 32, + } + masterLink := testLink{"tstEth", "dummy"} + + addLink(t, masterLink.name, masterLink.linkType) + defer deleteLink(t, masterLink.name) + + if err := NetworkLinkAddVlan(masterLink.name, tl.name, tl.id); err != nil { + t.Fatalf("Unable to create %#v VLAN interface: %s", tl, err) + } + + readLink(t, tl.name) +} + +func TestNetworkLinkAddMacVlan(t *testing.T) { + if testing.Short() { + return + } + + tl := struct { + name string + mode string + }{ + name: "tstVlan", + mode: "private", + } + masterLink := testLink{"tstEth", "dummy"} + + addLink(t, masterLink.name, masterLink.linkType) + defer deleteLink(t, masterLink.name) + + if err := NetworkLinkAddMacVlan(masterLink.name, tl.name, tl.mode); err != nil { + t.Fatalf("Unable to create %#v MAC VLAN interface: %s", tl, err) + } + + readLink(t, tl.name) +} + +func TestAddDelNetworkIp(t *testing.T) { + if testing.Short() { + return + } + + ifaceName := "lo" + ip := net.ParseIP("127.0.1.1") + mask := net.IPv4Mask(255, 255, 255, 255) + ipNet := &net.IPNet{IP: ip, Mask: mask} + + iface, err := net.InterfaceByName(ifaceName) + if err != nil { + t.Skip("No 'lo' interface; skipping tests") + } + + if err := NetworkLinkAddIp(iface, ip, ipNet); err != nil { + t.Fatalf("Could not add IP address %s to interface %#v: %s", ip.String(), iface, err) + } + + if !ipAssigned(iface, ip) { + t.Fatalf("Could not locate address '%s' in lo address list.", ip.String()) + } + + if err := NetworkLinkDelIp(iface, ip, ipNet); err != nil { + t.Fatalf("Could not delete IP address %s from interface %#v: %s", ip.String(), iface, err) + } + + if ipAssigned(iface, ip) { + t.Fatalf("Located address '%s' in lo address list after removal.", ip.String()) + } +} + +func TestCreateVethPair(t *testing.T) { + if testing.Short() { + return + } + + var ( + name1 = "veth1" + name2 = "veth2" + ) + + if err := NetworkCreateVethPair(name1, name2); err != nil { + t.Fatalf("Could not create veth pair %s %s: %s", name1, name2, err) + } + defer NetworkLinkDel(name1) + + readLink(t, name1) + readLink(t, name2) +} + +// +// netlink package tests which do not use RTNETLINK +// +func TestCreateBridgeWithMac(t *testing.T) { + if testing.Short() { + return + } + + name := "testbridge" + + if err := CreateBridge(name, true); err != nil { + t.Fatal(err) + } + + if _, err := net.InterfaceByName(name); err != nil { + t.Fatal(err) + } + + // cleanup and tests + + if err := DeleteBridge(name); err != nil { + t.Fatal(err) + } + + if _, err := net.InterfaceByName(name); err == nil { + t.Fatalf("expected error getting interface because %s bridge was deleted", name) + } +} + +func TestSetMacAddress(t *testing.T) { + if testing.Short() { + return + } + + name := "testmac" + mac := randMacAddr() + + if err := NetworkLinkAdd(name, "bridge"); err != nil { + t.Fatal(err) + } + defer NetworkLinkDel(name) + + if err := SetMacAddress(name, mac); err != nil { + t.Fatal(err) + } + + iface, err := net.InterfaceByName(name) + if err != nil { + t.Fatal(err) + } + + if iface.HardwareAddr.String() != mac { + t.Fatalf("mac address %q does not match %q", iface.HardwareAddr, mac) + } +} diff --git a/netlink/netlink_unsupported.go b/netlink/netlink_unsupported.go new file mode 100644 index 00000000..f6e84adf --- /dev/null +++ b/netlink/netlink_unsupported.go @@ -0,0 +1,84 @@ +// +build !linux + +package netlink + +import ( + "errors" + "net" +) + +var ( + ErrNotImplemented = errors.New("not implemented") +) + +func NetworkGetRoutes() ([]Route, error) { + return nil, ErrNotImplemented +} + +func NetworkLinkAdd(name string, linkType string) error { + return ErrNotImplemented +} + +func NetworkLinkDel(name string) error { + return ErrNotImplemented +} + +func NetworkLinkUp(iface *net.Interface) error { + return ErrNotImplemented +} + +func NetworkLinkAddIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { + return ErrNotImplemented +} + +func NetworkLinkDelIp(iface *net.Interface, ip net.IP, ipNet *net.IPNet) error { + return ErrNotImplemented +} + +func AddRoute(destination, source, gateway, device string) error { + return ErrNotImplemented +} + +func AddDefaultGw(ip, device string) error { + return ErrNotImplemented +} + +func NetworkSetMTU(iface *net.Interface, mtu int) error { + return ErrNotImplemented +} + +func NetworkCreateVethPair(name1, name2 string) error { + return ErrNotImplemented +} + +func NetworkChangeName(iface *net.Interface, newName string) error { + return ErrNotImplemented +} + +func NetworkSetNsFd(iface *net.Interface, fd int) error { + return ErrNotImplemented +} + +func NetworkSetNsPid(iface *net.Interface, nspid int) error { + return ErrNotImplemented +} + +func NetworkSetMaster(iface, master *net.Interface) error { + return ErrNotImplemented +} + +func NetworkLinkDown(iface *net.Interface) error { + return ErrNotImplemented +} + +func CreateBridge(name string, setMacAddr bool) error { + return ErrNotImplemented +} + +func DeleteBridge(name string) error { + return ErrNotImplemented +} + +func AddToBridge(iface, master *net.Interface) error { + return ErrNotImplemented +} diff --git a/network/loopback.go b/network/loopback.go new file mode 100644 index 00000000..1667b4d8 --- /dev/null +++ b/network/loopback.go @@ -0,0 +1,23 @@ +// +build linux + +package network + +import ( + "fmt" +) + +// Loopback is a network strategy that provides a basic loopback device +type Loopback struct { +} + +func (l *Loopback) Create(n *Network, nspid int, networkState *NetworkState) error { + return nil +} + +func (l *Loopback) Initialize(config *Network, networkState *NetworkState) error { + // Do not set the MTU on the loopback interface - use the default. + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + return nil +} diff --git a/network/netns.go b/network/netns.go new file mode 100644 index 00000000..73cd8de5 --- /dev/null +++ b/network/netns.go @@ -0,0 +1,39 @@ +// +build linux + +package network + +import ( + "fmt" + "os" + "syscall" + + "github.com/docker/libcontainer/system" +) + +// crosbymichael: could make a network strategy that instead of returning veth pair names it returns a pid to an existing network namespace +type NetNS struct { +} + +func (v *NetNS) Create(n *Network, nspid int, networkState *NetworkState) error { + networkState.NsPath = n.NsPath + return nil +} + +func (v *NetNS) Initialize(config *Network, networkState *NetworkState) error { + if networkState.NsPath == "" { + return fmt.Errorf("nspath does is not specified in NetworkState") + } + + f, err := os.OpenFile(networkState.NsPath, os.O_RDONLY, 0) + if err != nil { + return fmt.Errorf("failed get network namespace fd: %v", err) + } + + if err := system.Setns(f.Fd(), syscall.CLONE_NEWNET); err != nil { + f.Close() + return fmt.Errorf("failed to setns current network namespace: %v", err) + } + + f.Close() + return nil +} diff --git a/network/network.go b/network/network.go new file mode 100644 index 00000000..014ba743 --- /dev/null +++ b/network/network.go @@ -0,0 +1,97 @@ +// +build linux + +package network + +import ( + "net" + + "github.com/docker/libcontainer/netlink" +) + +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +func SetInterfaceInNamespaceFd(name string, fd uintptr) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsFd(iface, int(fd)) +} + +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.AddToBridge(iface, masterIface) +} + +func SetDefaultGateway(ip, ifaceName string) error { + return netlink.AddDefaultGw(ip, ifaceName) +} + +func SetInterfaceMac(name string, macaddr string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMacAddress(iface, macaddr) +} + +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMTU(iface, mtu) +} diff --git a/network/stats.go b/network/stats.go new file mode 100644 index 00000000..c8ece5c7 --- /dev/null +++ b/network/stats.go @@ -0,0 +1,69 @@ +package network + +import ( + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" +) + +type NetworkStats struct { + RxBytes uint64 `json:"rx_bytes"` + RxPackets uint64 `json:"rx_packets"` + RxErrors uint64 `json:"rx_errors"` + RxDropped uint64 `json:"rx_dropped"` + TxBytes uint64 `json:"tx_bytes"` + TxPackets uint64 `json:"tx_packets"` + TxErrors uint64 `json:"tx_errors"` + TxDropped uint64 `json:"tx_dropped"` +} + +// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo. +func GetStats(networkState *NetworkState) (*NetworkStats, error) { + // This can happen if the network runtime information is missing - possible if the container was created by an old version of libcontainer. + if networkState.VethHost == "" { + return &NetworkStats{}, nil + } + data, err := readSysfsNetworkStats(networkState.VethHost) + if err != nil { + return nil, err + } + + // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container. + return &NetworkStats{ + RxBytes: data["tx_bytes"], + RxPackets: data["tx_packets"], + RxErrors: data["tx_errors"], + RxDropped: data["tx_dropped"], + TxBytes: data["rx_bytes"], + TxPackets: data["rx_packets"], + TxErrors: data["rx_errors"], + TxDropped: data["rx_dropped"], + }, nil +} + +// Reads all the statistics available under /sys/class/net//statistics as a map with file name as key and data as integers. +func readSysfsNetworkStats(ethInterface string) (map[string]uint64, error) { + out := make(map[string]uint64) + + fullPath := filepath.Join("/sys/class/net", ethInterface, "statistics/") + err := filepath.Walk(fullPath, func(path string, _ os.FileInfo, _ error) error { + // skip fullPath. + if path == fullPath { + return nil + } + base := filepath.Base(path) + data, err := ioutil.ReadFile(path) + if err != nil { + return err + } + value, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + return err + } + out[base] = value + return nil + }) + return out, err +} diff --git a/network/strategy.go b/network/strategy.go new file mode 100644 index 00000000..be5ec93b --- /dev/null +++ b/network/strategy.go @@ -0,0 +1,35 @@ +// +build linux + +package network + +import ( + "errors" +) + +var ( + ErrNotValidStrategyType = errors.New("not a valid network strategy type") +) + +var strategies = map[string]NetworkStrategy{ + "veth": &Veth{}, + "loopback": &Loopback{}, + "netns": &NetNS{}, +} + +// NetworkStrategy represents a specific network configuration for +// a container's networking stack +type NetworkStrategy interface { + Create(*Network, int, *NetworkState) error + Initialize(*Network, *NetworkState) error +} + +// GetStrategy returns the specific network strategy for the +// provided type. If no strategy is registered for the type an +// ErrNotValidStrategyType is returned. +func GetStrategy(tpe string) (NetworkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, ErrNotValidStrategyType + } + return s, nil +} diff --git a/network/types.go b/network/types.go new file mode 100644 index 00000000..383e27c8 --- /dev/null +++ b/network/types.go @@ -0,0 +1,50 @@ +package network + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type,omitempty"` + + // Path to network namespace + NsPath string `json:"ns_path,omitempty"` + + // The bridge to use. + Bridge string `json:"bridge,omitempty"` + + // Prefix for the veth interfaces. + VethPrefix string `json:"veth_prefix,omitempty"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address,omitempty"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address,omitempty"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address,omitempty"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway,omitempty"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway,omitempty"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu,omitempty"` +} + +// Struct describing the network specific runtime state that will be maintained by libcontainer for all running containers +// Do not depend on it outside of libcontainer. +type NetworkState struct { + // The name of the veth interface on the Host. + VethHost string `json:"veth_host,omitempty"` + // The name of the veth interface created inside the container for the child. + VethChild string `json:"veth_child,omitempty"` + // Net namespace path. + NsPath string `json:"ns_path,omitempty"` +} diff --git a/network/veth.go b/network/veth.go new file mode 100644 index 00000000..e5185de7 --- /dev/null +++ b/network/veth.go @@ -0,0 +1,121 @@ +// +build linux + +package network + +import ( + "fmt" + + "github.com/docker/libcontainer/netlink" + "github.com/docker/libcontainer/utils" +) + +// Veth is a network strategy that uses a bridge and creates +// a veth pair, one that stays outside on the host and the other +// is placed inside the container's namespace +type Veth struct { +} + +const defaultDevice = "eth0" + +func (v *Veth) Create(n *Network, nspid int, networkState *NetworkState) error { + var ( + bridge = n.Bridge + prefix = n.VethPrefix + ) + if bridge == "" { + return fmt.Errorf("bridge is not specified") + } + if prefix == "" { + return fmt.Errorf("veth prefix is not specified") + } + name1, name2, err := createVethPair(prefix) + if err != nil { + return err + } + if err := SetInterfaceMaster(name1, bridge); err != nil { + return err + } + if err := SetMtu(name1, n.Mtu); err != nil { + return err + } + if err := InterfaceUp(name1); err != nil { + return err + } + if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { + return err + } + networkState.VethHost = name1 + networkState.VethChild = name2 + + return nil +} + +func (v *Veth) Initialize(config *Network, networkState *NetworkState) error { + var vethChild = networkState.VethChild + if vethChild == "" { + return fmt.Errorf("vethChild is not specified") + } + if err := InterfaceDown(vethChild); err != nil { + return fmt.Errorf("interface down %s %s", vethChild, err) + } + if err := ChangeInterfaceName(vethChild, defaultDevice); err != nil { + return fmt.Errorf("change %s to %s %s", vethChild, defaultDevice, err) + } + if config.MacAddress != "" { + if err := SetInterfaceMac(defaultDevice, config.MacAddress); err != nil { + return fmt.Errorf("set %s mac %s", defaultDevice, err) + } + } + if err := SetInterfaceIp(defaultDevice, config.Address); err != nil { + return fmt.Errorf("set %s ip %s", defaultDevice, err) + } + if config.IPv6Address != "" { + if err := SetInterfaceIp(defaultDevice, config.IPv6Address); err != nil { + return fmt.Errorf("set %s ipv6 %s", defaultDevice, err) + } + } + + if err := SetMtu(defaultDevice, config.Mtu); err != nil { + return fmt.Errorf("set %s mtu to %d %s", defaultDevice, config.Mtu, err) + } + if err := InterfaceUp(defaultDevice); err != nil { + return fmt.Errorf("%s up %s", defaultDevice, err) + } + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway, defaultDevice); err != nil { + return fmt.Errorf("set gateway to %s on device %s failed with %s", config.Gateway, defaultDevice, err) + } + } + if config.IPv6Gateway != "" { + if err := SetDefaultGateway(config.IPv6Gateway, defaultDevice); err != nil { + return fmt.Errorf("set gateway for ipv6 to %s on device %s failed with %s", config.IPv6Gateway, defaultDevice, err) + } + } + return nil +} + +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created +func createVethPair(prefix string) (name1 string, name2 string, err error) { + for i := 0; i < 10; i++ { + if name1, err = utils.GenerateRandomName(prefix, 7); err != nil { + return + } + + if name2, err = utils.GenerateRandomName(prefix, 7); err != nil { + return + } + + if err = CreateVethPair(name1, name2); err != nil { + if err == netlink.ErrInterfaceExists { + continue + } + + return + } + + break + } + + return +} diff --git a/network/veth_test.go b/network/veth_test.go new file mode 100644 index 00000000..e09a6042 --- /dev/null +++ b/network/veth_test.go @@ -0,0 +1,53 @@ +// +build linux + +package network + +import ( + "testing" + + "github.com/docker/libcontainer/netlink" +) + +func TestGenerateVethNames(t *testing.T) { + if testing.Short() { + return + } + + prefix := "veth" + + name1, name2, err := createVethPair(prefix) + if err != nil { + t.Fatal(err) + } + + if name1 == "" { + t.Fatal("name1 should not be empty") + } + + if name2 == "" { + t.Fatal("name2 should not be empty") + } +} + +func TestCreateDuplicateVethPair(t *testing.T) { + if testing.Short() { + return + } + + prefix := "veth" + + name1, name2, err := createVethPair(prefix) + if err != nil { + t.Fatal(err) + } + + // retry to create the name interfaces and make sure that we get the correct error + err = CreateVethPair(name1, name2) + if err == nil { + t.Fatal("expected error to not be nil with duplicate interface") + } + + if err != netlink.ErrInterfaceExists { + t.Fatalf("expected error to be ErrInterfaceExists but received %q", err) + } +} diff --git a/nsinit/config.go b/nsinit/config.go new file mode 100644 index 00000000..74c7b3c0 --- /dev/null +++ b/nsinit/config.go @@ -0,0 +1,29 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + + "github.com/codegangsta/cli" +) + +var configCommand = cli.Command{ + Name: "config", + Usage: "display the container configuration", + Action: configAction, +} + +func configAction(context *cli.Context) { + container, err := loadConfig() + if err != nil { + log.Fatal(err) + } + + data, err := json.MarshalIndent(container, "", "\t") + if err != nil { + log.Fatal(err) + } + + fmt.Printf("%s", data) +} diff --git a/nsinit/exec.go b/nsinit/exec.go new file mode 100644 index 00000000..6fc553b8 --- /dev/null +++ b/nsinit/exec.go @@ -0,0 +1,208 @@ +package main + +import ( + "fmt" + "io" + "log" + "os" + "os/exec" + "os/signal" + "syscall" + "text/tabwriter" + + "github.com/codegangsta/cli" + "github.com/docker/docker/pkg/term" + "github.com/docker/libcontainer" + consolepkg "github.com/docker/libcontainer/console" + "github.com/docker/libcontainer/namespaces" +) + +var execCommand = cli.Command{ + Name: "exec", + Usage: "execute a new command inside a container", + Action: execAction, + Flags: []cli.Flag{ + cli.BoolFlag{Name: "list", Usage: "list all registered exec functions"}, + cli.StringFlag{Name: "func", Value: "exec", Usage: "function name to exec inside a container"}, + }, +} + +func execAction(context *cli.Context) { + if context.Bool("list") { + w := tabwriter.NewWriter(os.Stdout, 10, 1, 3, ' ', 0) + fmt.Fprint(w, "NAME\tUSAGE\n") + + for k, f := range argvs { + fmt.Fprintf(w, "%s\t%s\n", k, f.Usage) + } + + w.Flush() + + return + } + + var exitCode int + + container, err := loadConfig() + if err != nil { + log.Fatal(err) + } + + state, err := libcontainer.GetState(dataPath) + if err != nil && !os.IsNotExist(err) { + log.Fatalf("unable to read state.json: %s", err) + } + + if state != nil { + exitCode, err = startInExistingContainer(container, state, context.String("func"), context) + } else { + exitCode, err = startContainer(container, dataPath, []string(context.Args())) + } + + if err != nil { + log.Fatalf("failed to exec: %s", err) + } + + os.Exit(exitCode) +} + +// the process for execing a new process inside an existing container is that we have to exec ourself +// with the nsenter argument so that the C code can setns an the namespaces that we require. Then that +// code path will drop us into the path that we can do the final setup of the namespace and exec the users +// application. +func startInExistingContainer(config *libcontainer.Config, state *libcontainer.State, action string, context *cli.Context) (int, error) { + var ( + master *os.File + console string + err error + + sigc = make(chan os.Signal, 10) + + stdin = os.Stdin + stdout = os.Stdout + stderr = os.Stderr + ) + signal.Notify(sigc) + + if config.Tty { + stdin = nil + stdout = nil + stderr = nil + + master, console, err = consolepkg.CreateMasterAndConsole() + if err != nil { + return -1, err + } + + go io.Copy(master, os.Stdin) + go io.Copy(os.Stdout, master) + + state, err := term.SetRawTerminal(os.Stdin.Fd()) + if err != nil { + return -1, err + } + + defer term.RestoreTerminal(os.Stdin.Fd(), state) + } + + startCallback := func(cmd *exec.Cmd) { + go func() { + resizeTty(master) + + for sig := range sigc { + switch sig { + case syscall.SIGWINCH: + resizeTty(master) + default: + cmd.Process.Signal(sig) + } + } + }() + } + + return namespaces.ExecIn(config, state, context.Args(), os.Args[0], action, stdin, stdout, stderr, console, startCallback) +} + +// startContainer starts the container. Returns the exit status or -1 and an +// error. +// +// Signals sent to the current process will be forwarded to container. +func startContainer(container *libcontainer.Config, dataPath string, args []string) (int, error) { + var ( + cmd *exec.Cmd + sigc = make(chan os.Signal, 10) + ) + + signal.Notify(sigc) + + createCommand := func(container *libcontainer.Config, console, dataPath, init string, pipe *os.File, args []string) *exec.Cmd { + cmd = namespaces.DefaultCreateCommand(container, console, dataPath, init, pipe, args) + if logPath != "" { + cmd.Env = append(cmd.Env, fmt.Sprintf("log=%s", logPath)) + } + return cmd + } + + var ( + master *os.File + console string + err error + + stdin = os.Stdin + stdout = os.Stdout + stderr = os.Stderr + ) + + if container.Tty { + stdin = nil + stdout = nil + stderr = nil + + master, console, err = consolepkg.CreateMasterAndConsole() + if err != nil { + return -1, err + } + + go io.Copy(master, os.Stdin) + go io.Copy(os.Stdout, master) + + state, err := term.SetRawTerminal(os.Stdin.Fd()) + if err != nil { + return -1, err + } + + defer term.RestoreTerminal(os.Stdin.Fd(), state) + } + + startCallback := func() { + go func() { + resizeTty(master) + + for sig := range sigc { + switch sig { + case syscall.SIGWINCH: + resizeTty(master) + default: + cmd.Process.Signal(sig) + } + } + }() + } + + return namespaces.Exec(container, stdin, stdout, stderr, console, dataPath, args, createCommand, startCallback) +} + +func resizeTty(master *os.File) { + if master == nil { + return + } + + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return + } + + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return + } +} diff --git a/nsinit/init.go b/nsinit/init.go new file mode 100644 index 00000000..c091ee10 --- /dev/null +++ b/nsinit/init.go @@ -0,0 +1,52 @@ +package main + +import ( + "log" + "os" + "runtime" + "strconv" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer/namespaces" + "github.com/docker/libcontainer/syncpipe" +) + +var ( + dataPath = os.Getenv("data_path") + console = os.Getenv("console") + rawPipeFd = os.Getenv("pipe") + + initCommand = cli.Command{ + Name: "init", + Usage: "runs the init process inside the namespace", + Action: initAction, + } +) + +func initAction(context *cli.Context) { + runtime.LockOSThread() + + container, err := loadConfig() + if err != nil { + log.Fatal(err) + } + + rootfs, err := os.Getwd() + if err != nil { + log.Fatal(err) + } + + pipeFd, err := strconv.Atoi(rawPipeFd) + if err != nil { + log.Fatal(err) + } + + syncPipe, err := syncpipe.NewSyncPipeFromFd(0, uintptr(pipeFd)) + if err != nil { + log.Fatalf("unable to create sync pipe: %s", err) + } + + if err := namespaces.Init(container, rootfs, console, syncPipe, []string(context.Args())); err != nil { + log.Fatalf("unable to initialize for container: %s", err) + } +} diff --git a/nsinit/main.go b/nsinit/main.go new file mode 100644 index 00000000..d65c0140 --- /dev/null +++ b/nsinit/main.go @@ -0,0 +1,67 @@ +package main + +import ( + "log" + "os" + "strings" + + "github.com/codegangsta/cli" +) + +var ( + logPath = os.Getenv("log") + argvs = make(map[string]*rFunc) +) + +func init() { + argvs["exec"] = &rFunc{ + Usage: "execute a process inside an existing container", + Action: nsenterExec, + } + + argvs["mknod"] = &rFunc{ + Usage: "mknod a device inside an existing container", + Action: nsenterMknod, + } + + argvs["ip"] = &rFunc{ + Usage: "display the container's network interfaces", + Action: nsenterIp, + } +} + +func main() { + // we need to check our argv 0 for any registred functions to run instead of the + // normal cli code path + f, exists := argvs[strings.TrimPrefix(os.Args[0], "nsenter-")] + if exists { + runFunc(f) + + return + } + + app := cli.NewApp() + + app.Name = "nsinit" + app.Version = "0.1" + app.Author = "libcontainer maintainers" + app.Flags = []cli.Flag{ + cli.StringFlag{Name: "nspid"}, + cli.StringFlag{Name: "console"}, + } + + app.Before = preload + + app.Commands = []cli.Command{ + execCommand, + initCommand, + statsCommand, + configCommand, + pauseCommand, + unpauseCommand, + } + + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } +} diff --git a/nsinit/nsenter.go b/nsinit/nsenter.go new file mode 100644 index 00000000..8dc149f4 --- /dev/null +++ b/nsinit/nsenter.go @@ -0,0 +1,84 @@ +package main + +import ( + "fmt" + "log" + "net" + "os" + "strconv" + "strings" + "text/tabwriter" + + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/devices" + "github.com/docker/libcontainer/mount/nodes" + "github.com/docker/libcontainer/namespaces" + _ "github.com/docker/libcontainer/namespaces/nsenter" +) + +// nsenterExec exec's a process inside an existing container +func nsenterExec(config *libcontainer.Config, args []string) { + if err := namespaces.FinalizeSetns(config, args); err != nil { + log.Fatalf("failed to nsenter: %s", err) + } +} + +// nsenterMknod runs mknod inside an existing container +// +// mknod +func nsenterMknod(config *libcontainer.Config, args []string) { + if len(args) != 4 { + log.Fatalf("expected mknod to have 4 arguments not %d", len(args)) + } + + t := rune(args[1][0]) + + major, err := strconv.Atoi(args[2]) + if err != nil { + log.Fatal(err) + } + + minor, err := strconv.Atoi(args[3]) + if err != nil { + log.Fatal(err) + } + + n := &devices.Device{ + Path: args[0], + Type: t, + MajorNumber: int64(major), + MinorNumber: int64(minor), + } + + if err := nodes.CreateDeviceNode("/", n); err != nil { + log.Fatal(err) + } +} + +// nsenterIp displays the network interfaces inside a container's net namespace +func nsenterIp(config *libcontainer.Config, args []string) { + interfaces, err := net.Interfaces() + if err != nil { + log.Fatal(err) + } + + w := tabwriter.NewWriter(os.Stdout, 10, 1, 3, ' ', 0) + fmt.Fprint(w, "NAME\tMTU\tMAC\tFLAG\tADDRS\n") + + for _, iface := range interfaces { + addrs, err := iface.Addrs() + if err != nil { + log.Fatal(err) + } + + o := []string{} + + for _, a := range addrs { + o = append(o, a.String()) + } + + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\n", iface.Name, iface.MTU, iface.HardwareAddr, iface.Flags, strings.Join(o, ",")) + } + + w.Flush() +} diff --git a/nsinit/pause.go b/nsinit/pause.go new file mode 100644 index 00000000..ada24250 --- /dev/null +++ b/nsinit/pause.go @@ -0,0 +1,49 @@ +package main + +import ( + "log" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/cgroups/fs" + "github.com/docker/libcontainer/cgroups/systemd" +) + +var pauseCommand = cli.Command{ + Name: "pause", + Usage: "pause the container's processes", + Action: pauseAction, +} + +var unpauseCommand = cli.Command{ + Name: "unpause", + Usage: "unpause the container's processes", + Action: unpauseAction, +} + +func pauseAction(context *cli.Context) { + if err := toggle(cgroups.Frozen); err != nil { + log.Fatal(err) + } +} + +func unpauseAction(context *cli.Context) { + if err := toggle(cgroups.Thawed); err != nil { + log.Fatal(err) + } +} + +func toggle(state cgroups.FreezerState) error { + container, err := loadConfig() + if err != nil { + return err + } + + if systemd.UseSystemd() { + err = systemd.Freeze(container.Cgroups, state) + } else { + err = fs.Freeze(container.Cgroups, state) + } + + return err +} diff --git a/nsinit/stats.go b/nsinit/stats.go new file mode 100644 index 00000000..612b4a4b --- /dev/null +++ b/nsinit/stats.go @@ -0,0 +1,39 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer" +) + +var statsCommand = cli.Command{ + Name: "stats", + Usage: "display statistics for the container", + Action: statsAction, +} + +func statsAction(context *cli.Context) { + container, err := loadConfig() + if err != nil { + log.Fatal(err) + } + + state, err := libcontainer.GetState(dataPath) + if err != nil { + log.Fatal(err) + } + + stats, err := libcontainer.GetStats(container, state) + if err != nil { + log.Fatal(err) + } + data, err := json.MarshalIndent(stats, "", "\t") + if err != nil { + log.Fatal(err) + } + + fmt.Printf("%s", data) +} diff --git a/nsinit/utils.go b/nsinit/utils.go new file mode 100644 index 00000000..7f515594 --- /dev/null +++ b/nsinit/utils.go @@ -0,0 +1,94 @@ +package main + +import ( + "encoding/json" + "log" + "os" + "path/filepath" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/syncpipe" +) + +// rFunc is a function registration for calling after an execin +type rFunc struct { + Usage string + Action func(*libcontainer.Config, []string) +} + +func loadConfig() (*libcontainer.Config, error) { + f, err := os.Open(filepath.Join(dataPath, "container.json")) + if err != nil { + return nil, err + } + defer f.Close() + + var container *libcontainer.Config + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + + return container, nil +} + +func openLog(name string) error { + f, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0755) + if err != nil { + return err + } + + log.SetOutput(f) + + return nil +} + +func findUserArgs() []string { + i := 0 + for _, a := range os.Args { + i++ + + if a == "--" { + break + } + } + + return os.Args[i:] +} + +// loadConfigFromFd loads a container's config from the sync pipe that is provided by +// fd 3 when running a process +func loadConfigFromFd() (*libcontainer.Config, error) { + syncPipe, err := syncpipe.NewSyncPipeFromFd(0, 3) + if err != nil { + return nil, err + } + + var config *libcontainer.Config + if err := syncPipe.ReadFromParent(&config); err != nil { + return nil, err + } + + return config, nil +} + +func preload(context *cli.Context) error { + if logPath != "" { + if err := openLog(logPath); err != nil { + return err + } + } + + return nil +} + +func runFunc(f *rFunc) { + userArgs := findUserArgs() + + config, err := loadConfigFromFd() + if err != nil { + log.Fatalf("unable to receive config from sync pipe: %s", err) + } + + f.Action(config, userArgs) +} diff --git a/process.go b/process.go new file mode 100644 index 00000000..489666a5 --- /dev/null +++ b/process.go @@ -0,0 +1,27 @@ +package libcontainer + +import "io" + +// Configuration for a process to be run inside a container. +type ProcessConfig struct { + // The command to be run followed by any arguments. + Args []string + + // Map of environment variables to their values. + Env []string + + // Stdin is a pointer to a reader which provides the standard input stream. + // Stdout is a pointer to a writer which receives the standard output stream. + // Stderr is a pointer to a writer which receives the standard error stream. + // + // If a reader or writer is nil, the input stream is assumed to be empty and the output is + // discarded. + // + // The readers and writers, if supplied, are closed when the process terminates. Their Close + // methods should be idempotent. + // + // Stdout and Stderr may refer to the same writer in which case the output is interspersed. + Stdin io.ReadCloser + Stdout io.WriteCloser + Stderr io.WriteCloser +} diff --git a/sample_configs/README.md b/sample_configs/README.md new file mode 100644 index 00000000..4ccc6cde --- /dev/null +++ b/sample_configs/README.md @@ -0,0 +1,5 @@ +These configuration files can be used with `nsinit` to quickly develop, test, +and experiment with features of libcontainer. + +When consuming these configuration files, copy them into your rootfs and rename +the file to `container.json` for use with `nsinit`. diff --git a/sample_configs/apparmor.json b/sample_configs/apparmor.json new file mode 100644 index 00000000..f739df10 --- /dev/null +++ b/sample_configs/apparmor.json @@ -0,0 +1,196 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "apparmor_profile": "docker-default", + "mount_config": { + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": { + "NEWIPC": true, + "NEWNET": true, + "NEWNS": true, + "NEWPID": true, + "NEWUTS": true + }, + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + } + ], + "tty": true, + "user": "daemon" +} diff --git a/sample_configs/attach_to_bridge.json b/sample_configs/attach_to_bridge.json new file mode 100644 index 00000000..0795e6c1 --- /dev/null +++ b/sample_configs/attach_to_bridge.json @@ -0,0 +1,202 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "mount_config": { + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": { + "NEWIPC": true, + "NEWNET": true, + "NEWNS": true, + "NEWPID": true, + "NEWUTS": true + }, + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + }, + { + "address": "172.17.0.101/16", + "bridge": "docker0", + "veth_prefix": "veth", + "gateway": "172.17.42.1", + "mtu": 1500, + "type": "veth" + } + ], + "tty": true +} diff --git a/sample_configs/minimal.json b/sample_configs/minimal.json new file mode 100644 index 00000000..8d85ddf7 --- /dev/null +++ b/sample_configs/minimal.json @@ -0,0 +1,201 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "mount_config": { + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "mounts": [ + { + "type": "tmpfs", + "destination": "/tmp" + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": { + "NEWIPC": true, + "NEWNET": true, + "NEWNS": true, + "NEWPID": true, + "NEWUTS": true + }, + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + } + ], + "tty": true, + "user": "daemon" +} diff --git a/sample_configs/selinux.json b/sample_configs/selinux.json new file mode 100644 index 00000000..ce383e2c --- /dev/null +++ b/sample_configs/selinux.json @@ -0,0 +1,197 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "process_label": "system_u:system_r:svirt_lxc_net_t:s0:c164,c475", + "mount_config": { + "mount_label": "system_u:system_r:svirt_lxc_net_t:s0:c164,c475", + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": { + "NEWIPC": true, + "NEWNET": true, + "NEWNS": true, + "NEWPID": true, + "NEWUTS": true + }, + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + } + ], + "tty": true, + "user": "daemon" +} diff --git a/security/capabilities/capabilities.go b/security/capabilities/capabilities.go new file mode 100644 index 00000000..7aef5fa6 --- /dev/null +++ b/security/capabilities/capabilities.go @@ -0,0 +1,56 @@ +package capabilities + +import ( + "os" + + "github.com/syndtr/gocapability/capability" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDS + +// DropBoundingSet drops the capability bounding set to those specified in the +// container configuration. +func DropBoundingSet(capabilities []string) error { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + + keep := getEnabledCapabilities(capabilities) + c.Clear(capability.BOUNDS) + c.Set(capability.BOUNDS, keep...) + + if err := c.Apply(capability.BOUNDS); err != nil { + return err + } + + return nil +} + +// DropCapabilities drops all capabilities for the current process except those specified in the container configuration. +func DropCapabilities(capList []string) error { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + + keep := getEnabledCapabilities(capList) + c.Clear(allCapabilityTypes) + c.Set(allCapabilityTypes, keep...) + + if err := c.Apply(allCapabilityTypes); err != nil { + return err + } + return nil +} + +// getEnabledCapabilities returns the capabilities that should not be dropped by the container. +func getEnabledCapabilities(capList []string) []capability.Cap { + keep := []capability.Cap{} + for _, capability := range capList { + if c := GetCapability(capability); c != nil { + keep = append(keep, c.Value) + } + } + return keep +} diff --git a/security/capabilities/types.go b/security/capabilities/types.go new file mode 100644 index 00000000..a960b804 --- /dev/null +++ b/security/capabilities/types.go @@ -0,0 +1,88 @@ +package capabilities + +import "github.com/syndtr/gocapability/capability" + +type ( + CapabilityMapping struct { + Key string `json:"key,omitempty"` + Value capability.Cap `json:"value,omitempty"` + } + Capabilities []*CapabilityMapping +) + +func (c *CapabilityMapping) String() string { + return c.Key +} + +func GetCapability(key string) *CapabilityMapping { + for _, capp := range capabilityList { + if capp.Key == key { + cpy := *capp + return &cpy + } + } + return nil +} + +func GetAllCapabilities() []string { + output := make([]string, len(capabilityList)) + for i, capability := range capabilityList { + output[i] = capability.String() + } + return output +} + +// Contains returns true if the specified Capability is +// in the slice +func (c Capabilities) contains(capp string) bool { + return c.get(capp) != nil +} + +func (c Capabilities) get(capp string) *CapabilityMapping { + for _, cap := range c { + if cap.Key == capp { + return cap + } + } + return nil +} + +var capabilityList = Capabilities{ + {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, + {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, + {Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO}, + {Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT}, + {Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN}, + {Key: "SYS_NICE", Value: capability.CAP_SYS_NICE}, + {Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE}, + {Key: "SYS_TIME", Value: capability.CAP_SYS_TIME}, + {Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG}, + {Key: "MKNOD", Value: capability.CAP_MKNOD}, + {Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE}, + {Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL}, + {Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE}, + {Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN}, + {Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN}, + {Key: "SYSLOG", Value: capability.CAP_SYSLOG}, + {Key: "CHOWN", Value: capability.CAP_CHOWN}, + {Key: "NET_RAW", Value: capability.CAP_NET_RAW}, + {Key: "DAC_OVERRIDE", Value: capability.CAP_DAC_OVERRIDE}, + {Key: "FOWNER", Value: capability.CAP_FOWNER}, + {Key: "DAC_READ_SEARCH", Value: capability.CAP_DAC_READ_SEARCH}, + {Key: "FSETID", Value: capability.CAP_FSETID}, + {Key: "KILL", Value: capability.CAP_KILL}, + {Key: "SETGID", Value: capability.CAP_SETGID}, + {Key: "SETUID", Value: capability.CAP_SETUID}, + {Key: "LINUX_IMMUTABLE", Value: capability.CAP_LINUX_IMMUTABLE}, + {Key: "NET_BIND_SERVICE", Value: capability.CAP_NET_BIND_SERVICE}, + {Key: "NET_BROADCAST", Value: capability.CAP_NET_BROADCAST}, + {Key: "IPC_LOCK", Value: capability.CAP_IPC_LOCK}, + {Key: "IPC_OWNER", Value: capability.CAP_IPC_OWNER}, + {Key: "SYS_CHROOT", Value: capability.CAP_SYS_CHROOT}, + {Key: "SYS_PTRACE", Value: capability.CAP_SYS_PTRACE}, + {Key: "SYS_BOOT", Value: capability.CAP_SYS_BOOT}, + {Key: "LEASE", Value: capability.CAP_LEASE}, + {Key: "SETFCAP", Value: capability.CAP_SETFCAP}, + {Key: "WAKE_ALARM", Value: capability.CAP_WAKE_ALARM}, + {Key: "BLOCK_SUSPEND", Value: capability.CAP_BLOCK_SUSPEND}, +} diff --git a/security/capabilities/types_test.go b/security/capabilities/types_test.go new file mode 100644 index 00000000..06e8a2b0 --- /dev/null +++ b/security/capabilities/types_test.go @@ -0,0 +1,19 @@ +package capabilities + +import ( + "testing" +) + +func TestCapabilitiesContains(t *testing.T) { + caps := Capabilities{ + GetCapability("MKNOD"), + GetCapability("SETPCAP"), + } + + if caps.contains("SYS_ADMIN") { + t.Fatal("capabilities should not contain SYS_ADMIN") + } + if !caps.contains("MKNOD") { + t.Fatal("capabilities should contain MKNOD but does not") + } +} diff --git a/security/restrict/restrict.go b/security/restrict/restrict.go new file mode 100644 index 00000000..dd765b1f --- /dev/null +++ b/security/restrict/restrict.go @@ -0,0 +1,53 @@ +// +build linux + +package restrict + +import ( + "fmt" + "os" + "syscall" + "time" +) + +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +func mountReadonly(path string) error { + for i := 0; i < 5; i++ { + if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) { + switch err { + case syscall.EINVAL: + // Probably not a mountpoint, use bind-mount + if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil { + return err + } + + return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "") + case syscall.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + + return nil + } + + return fmt.Errorf("unable to mount %s as readonly max retries reached", path) +} + +// This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts). +// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes). +func Restrict(mounts ...string) error { + for _, dest := range mounts { + if err := mountReadonly(dest); err != nil { + return fmt.Errorf("unable to remount %s readonly: %s", dest, err) + } + } + + if err := syscall.Mount("/dev/null", "/proc/kcore", "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("unable to bind-mount /dev/null over /proc/kcore: %s", err) + } + + return nil +} diff --git a/security/restrict/unsupported.go b/security/restrict/unsupported.go new file mode 100644 index 00000000..464e8d49 --- /dev/null +++ b/security/restrict/unsupported.go @@ -0,0 +1,9 @@ +// +build !linux + +package restrict + +import "fmt" + +func Restrict() error { + return fmt.Errorf("not supported") +} diff --git a/selinux/selinux.go b/selinux/selinux.go new file mode 100644 index 00000000..e0c90ee5 --- /dev/null +++ b/selinux/selinux.go @@ -0,0 +1,436 @@ +// +build linux + +package selinux + +import ( + "bufio" + "crypto/rand" + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "syscall" + + "github.com/docker/docker/pkg/mount" + "github.com/docker/libcontainer/system" +) + +const ( + Enforcing = 1 + Permissive = 0 + Disabled = -1 + selinuxDir = "/etc/selinux/" + selinuxConfig = selinuxDir + "config" + selinuxTypeTag = "SELINUXTYPE" + selinuxTag = "SELINUX" + selinuxPath = "/sys/fs/selinux" + xattrNameSelinux = "security.selinux" + stRdOnly = 0x01 +) + +var ( + assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`) + spaceRegex = regexp.MustCompile(`^([^=]+) (.*)$`) + mcsList = make(map[string]bool) + selinuxfs = "unknown" + selinuxEnabled = false + selinuxEnabledChecked = false +) + +type SELinuxContext map[string]string + +// SetDisabled disables selinux support for the package +func SetDisabled() { + selinuxEnabled, selinuxEnabledChecked = false, true +} + +func getSelinuxMountPoint() string { + if selinuxfs != "unknown" { + return selinuxfs + } + selinuxfs = "" + + mounts, err := mount.GetMounts() + if err != nil { + return selinuxfs + } + for _, mount := range mounts { + if mount.Fstype == "selinuxfs" { + selinuxfs = mount.Mountpoint + break + } + } + if selinuxfs != "" { + var buf syscall.Statfs_t + syscall.Statfs(selinuxfs, &buf) + if (buf.Flags & stRdOnly) == 1 { + selinuxfs = "" + } + } + return selinuxfs +} + +func SelinuxEnabled() bool { + if selinuxEnabledChecked { + return selinuxEnabled + } + selinuxEnabledChecked = true + if fs := getSelinuxMountPoint(); fs != "" { + if con, _ := Getcon(); con != "kernel" { + selinuxEnabled = true + } + } + return selinuxEnabled +} + +func readConfig(target string) (value string) { + var ( + val, key string + bufin *bufio.Reader + ) + + in, err := os.Open(selinuxConfig) + if err != nil { + return "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err != io.EOF { + return "" + } + done = true + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == target { + return strings.Trim(val, "\"") + } + } + } + return "" +} + +func getSELinuxPolicyRoot() string { + return selinuxDir + readConfig(selinuxTypeTag) +} + +func readCon(name string) (string, error) { + var val string + + in, err := os.Open(name) + if err != nil { + return "", err + } + defer in.Close() + + _, err = fmt.Fscanf(in, "%s", &val) + return val, err +} + +func Setfilecon(path string, scon string) error { + return system.Lsetxattr(path, xattrNameSelinux, []byte(scon), 0) +} + +// Return the SELinux label for this path +func Getfilecon(path string) (string, error) { + con, err := system.Lgetxattr(path, xattrNameSelinux) + return string(con), err +} + +func Setfscreatecon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid()), scon) +} + +func Getfscreatecon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/fscreate", syscall.Gettid())) +} + +// Return the SELinux label of the current process thread. +func Getcon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/current", syscall.Gettid())) +} + +func Getpidcon(pid int) (string, error) { + return readCon(fmt.Sprintf("/proc/%d/attr/current", pid)) +} + +func Getexeccon() (string, error) { + return readCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid())) +} + +func writeCon(name string, val string) error { + out, err := os.OpenFile(name, os.O_WRONLY, 0) + if err != nil { + return err + } + defer out.Close() + + if val != "" { + _, err = out.Write([]byte(val)) + } else { + _, err = out.Write(nil) + } + return err +} + +func Setexeccon(scon string) error { + return writeCon(fmt.Sprintf("/proc/self/task/%d/attr/exec", syscall.Gettid()), scon) +} + +func (c SELinuxContext) Get() string { + return fmt.Sprintf("%s:%s:%s:%s", c["user"], c["role"], c["type"], c["level"]) +} + +func NewContext(scon string) SELinuxContext { + c := make(SELinuxContext) + + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + c["user"] = con[0] + c["role"] = con[1] + c["type"] = con[2] + c["level"] = con[3] + } + return c +} + +func ReserveLabel(scon string) { + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + mcsAdd(con[3]) + } +} + +func SelinuxGetEnforce() int { + var enforce int + + enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) + if err != nil { + return -1 + } + + enforce, err = strconv.Atoi(string(enforceS)) + if err != nil { + return -1 + } + return enforce +} + +func SelinuxGetEnforceMode() int { + switch readConfig(selinuxTag) { + case "enforcing": + return Enforcing + case "permissive": + return Permissive + } + return Disabled +} + +func mcsAdd(mcs string) error { + if mcsList[mcs] { + return fmt.Errorf("MCS Label already exists") + } + mcsList[mcs] = true + return nil +} + +func mcsDelete(mcs string) { + mcsList[mcs] = false +} + +func mcsExists(mcs string) bool { + return mcsList[mcs] +} + +func IntToMcs(id int, catRange uint32) string { + var ( + SETSIZE = int(catRange) + TIER = SETSIZE + ORD = id + ) + + if id < 1 || id > 523776 { + return "" + } + + for ORD > TIER { + ORD = ORD - TIER + TIER -= 1 + } + TIER = SETSIZE - TIER + ORD = ORD + TIER + return fmt.Sprintf("s0:c%d,c%d", TIER, ORD) +} + +func uniqMcs(catRange uint32) string { + var ( + n uint32 + c1, c2 uint32 + mcs string + ) + + for { + binary.Read(rand.Reader, binary.LittleEndian, &n) + c1 = n % catRange + binary.Read(rand.Reader, binary.LittleEndian, &n) + c2 = n % catRange + if c1 == c2 { + continue + } else { + if c1 > c2 { + t := c1 + c1 = c2 + c2 = t + } + } + mcs = fmt.Sprintf("s0:c%d,c%d", c1, c2) + if err := mcsAdd(mcs); err != nil { + continue + } + break + } + return mcs +} + +func FreeLxcContexts(scon string) { + if len(scon) != 0 { + con := strings.SplitN(scon, ":", 4) + mcsDelete(con[3]) + } +} + +func GetLxcContexts() (processLabel string, fileLabel string) { + var ( + val, key string + bufin *bufio.Reader + ) + + if !SelinuxEnabled() { + return "", "" + } + lxcPath := fmt.Sprintf("%s/contexts/lxc_contexts", getSELinuxPolicyRoot()) + in, err := os.Open(lxcPath) + if err != nil { + return "", "" + } + defer in.Close() + + bufin = bufio.NewReader(in) + + for done := false; !done; { + var line string + if line, err = bufin.ReadString('\n'); err != nil { + if err == io.EOF { + done = true + } else { + goto exit + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + // Skip blank lines + continue + } + if line[0] == ';' || line[0] == '#' { + // Skip comments + continue + } + if groups := assignRegex.FindStringSubmatch(line); groups != nil { + key, val = strings.TrimSpace(groups[1]), strings.TrimSpace(groups[2]) + if key == "process" { + processLabel = strings.Trim(val, "\"") + } + if key == "file" { + fileLabel = strings.Trim(val, "\"") + } + } + } + + if processLabel == "" || fileLabel == "" { + return "", "" + } + +exit: + // mcs := IntToMcs(os.Getpid(), 1024) + mcs := uniqMcs(1024) + scon := NewContext(processLabel) + scon["level"] = mcs + processLabel = scon.Get() + scon = NewContext(fileLabel) + scon["level"] = mcs + fileLabel = scon.Get() + return processLabel, fileLabel +} + +func SecurityCheckContext(val string) error { + return writeCon(fmt.Sprintf("%s.context", selinuxPath), val) +} + +func CopyLevel(src, dest string) (string, error) { + if src == "" { + return "", nil + } + if err := SecurityCheckContext(src); err != nil { + return "", err + } + if err := SecurityCheckContext(dest); err != nil { + return "", err + } + scon := NewContext(src) + tcon := NewContext(dest) + mcsDelete(tcon["level"]) + mcsAdd(scon["level"]) + tcon["level"] = scon["level"] + return tcon.Get(), nil +} + +// Prevent users from relabing system files +func badPrefix(fpath string) error { + var badprefixes = []string{"/usr"} + + for _, prefix := range badprefixes { + if fpath == prefix || strings.HasPrefix(fpath, fmt.Sprintf("%s/", prefix)) { + return fmt.Errorf("Relabeling content in %s is not allowed.", prefix) + } + } + return nil +} + +// Change the fpath file object to the SELinux label scon. +// If the fpath is a directory and recurse is true Chcon will walk the +// directory tree setting the label +func Chcon(fpath string, scon string, recurse bool) error { + if scon == "" { + return nil + } + if err := badPrefix(fpath); err != nil { + return err + } + callback := func(p string, info os.FileInfo, err error) error { + return Setfilecon(p, scon) + } + + if recurse { + return filepath.Walk(fpath, callback) + } + + return Setfilecon(fpath, scon) +} diff --git a/selinux/selinux_test.go b/selinux/selinux_test.go new file mode 100644 index 00000000..34c34974 --- /dev/null +++ b/selinux/selinux_test.go @@ -0,0 +1,64 @@ +// +build linux + +package selinux_test + +import ( + "os" + "testing" + + "github.com/docker/libcontainer/selinux" +) + +func testSetfilecon(t *testing.T) { + if selinux.SelinuxEnabled() { + tmp := "selinux_test" + out, _ := os.OpenFile(tmp, os.O_WRONLY, 0) + out.Close() + err := selinux.Setfilecon(tmp, "system_u:object_r:bin_t:s0") + if err != nil { + t.Log("Setfilecon failed") + t.Fatal(err) + } + os.Remove(tmp) + } +} + +func TestSELinux(t *testing.T) { + var ( + err error + plabel, flabel string + ) + + if selinux.SelinuxEnabled() { + t.Log("Enabled") + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + selinux.FreeLxcContexts(plabel) + plabel, flabel = selinux.GetLxcContexts() + t.Log(plabel) + t.Log(flabel) + selinux.FreeLxcContexts(plabel) + t.Log("getenforce ", selinux.SelinuxGetEnforce()) + t.Log("getenforcemode ", selinux.SelinuxGetEnforceMode()) + pid := os.Getpid() + t.Log("PID:%d MCS:%s\n", pid, selinux.IntToMcs(pid, 1023)) + err = selinux.Setfscreatecon("unconfined_u:unconfined_r:unconfined_t:s0") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + err = selinux.Setfscreatecon("") + if err == nil { + t.Log(selinux.Getfscreatecon()) + } else { + t.Log("setfscreatecon failed", err) + t.Fatal(err) + } + t.Log(selinux.Getpidcon(1)) + } else { + t.Log("Disabled") + } +} diff --git a/state.go b/state.go new file mode 100644 index 00000000..208b4c62 --- /dev/null +++ b/state.go @@ -0,0 +1,77 @@ +package libcontainer + +import ( + "encoding/json" + "os" + "path/filepath" + + "github.com/docker/libcontainer/network" +) + +// State represents a running container's state +type State struct { + // InitPid is the init process id in the parent namespace + InitPid int `json:"init_pid,omitempty"` + + // InitStartTime is the init process start time + InitStartTime string `json:"init_start_time,omitempty"` + + // Network runtime state. + NetworkState network.NetworkState `json:"network_state,omitempty"` + + // Path to all the cgroups setup for a container. Key is cgroup subsystem name. + CgroupPaths map[string]string `json:"cgroup_paths,omitempty"` +} + +// The running state of the container. +type RunState int + +const ( + // The name of the runtime state file + stateFile = "state.json" + + // The container exists and is running. + Running RunState = iota + + // The container exists, it is in the process of being paused. + Pausing + + // The container exists, but all its processes are paused. + Paused + + // The container does not exist. + Destroyed +) + +// SaveState writes the container's runtime state to a state.json file +// in the specified path +func SaveState(basePath string, state *State) error { + f, err := os.Create(filepath.Join(basePath, stateFile)) + if err != nil { + return err + } + defer f.Close() + + return json.NewEncoder(f).Encode(state) +} + +// GetState reads the state.json file for a running container +func GetState(basePath string) (*State, error) { + f, err := os.Open(filepath.Join(basePath, stateFile)) + if err != nil { + return nil, err + } + defer f.Close() + + var state *State + if err := json.NewDecoder(f).Decode(&state); err != nil { + return nil, err + } + + return state, nil +} + +// DeleteState deletes the state.json file +func DeleteState(basePath string) error { + return os.Remove(filepath.Join(basePath, stateFile)) +} diff --git a/syncpipe/sync_pipe.go b/syncpipe/sync_pipe.go new file mode 100644 index 00000000..f73c354d --- /dev/null +++ b/syncpipe/sync_pipe.go @@ -0,0 +1,105 @@ +package syncpipe + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "syscall" +) + +// SyncPipe allows communication to and from the child processes +// to it's parent and allows the two independent processes to +// syncronize their state. +type SyncPipe struct { + parent, child *os.File +} + +func NewSyncPipeFromFd(parentFd, childFd uintptr) (*SyncPipe, error) { + s := &SyncPipe{} + + if parentFd > 0 { + s.parent = os.NewFile(parentFd, "parentPipe") + } else if childFd > 0 { + s.child = os.NewFile(childFd, "childPipe") + } else { + return nil, fmt.Errorf("no valid sync pipe fd specified") + } + + return s, nil +} + +func (s *SyncPipe) Child() *os.File { + return s.child +} + +func (s *SyncPipe) Parent() *os.File { + return s.parent +} + +func (s *SyncPipe) SendToChild(v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + + s.parent.Write(data) + + return syscall.Shutdown(int(s.parent.Fd()), syscall.SHUT_WR) +} + +func (s *SyncPipe) ReadFromChild() error { + data, err := ioutil.ReadAll(s.parent) + if err != nil { + return err + } + + if len(data) > 0 { + return fmt.Errorf("%s", data) + } + + return nil +} + +func (s *SyncPipe) ReadFromParent(v interface{}) error { + data, err := ioutil.ReadAll(s.child) + if err != nil { + return fmt.Errorf("error reading from sync pipe %s", err) + } + + if len(data) > 0 { + if err := json.Unmarshal(data, v); err != nil { + return err + } + } + + return nil +} + +func (s *SyncPipe) ReportChildError(err error) { + // ensure that any data sent from the parent is consumed so it doesn't + // receive ECONNRESET when the child writes to the pipe. + ioutil.ReadAll(s.child) + + s.child.Write([]byte(err.Error())) + s.CloseChild() +} + +func (s *SyncPipe) Close() error { + if s.parent != nil { + s.parent.Close() + } + + if s.child != nil { + s.child.Close() + } + + return nil +} + +func (s *SyncPipe) CloseChild() { + if s.child != nil { + s.child.Close() + s.child = nil + } +} diff --git a/syncpipe/sync_pipe_linux.go b/syncpipe/sync_pipe_linux.go new file mode 100644 index 00000000..bea4b52f --- /dev/null +++ b/syncpipe/sync_pipe_linux.go @@ -0,0 +1,20 @@ +package syncpipe + +import ( + "os" + "syscall" +) + +func NewSyncPipe() (s *SyncPipe, err error) { + s = &SyncPipe{} + + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, err + } + + s.child = os.NewFile(uintptr(fds[0]), "child syncpipe") + s.parent = os.NewFile(uintptr(fds[1]), "parent syncpipe") + + return s, nil +} diff --git a/syncpipe/sync_pipe_test.go b/syncpipe/sync_pipe_test.go new file mode 100644 index 00000000..906e6ed2 --- /dev/null +++ b/syncpipe/sync_pipe_test.go @@ -0,0 +1,72 @@ +package syncpipe + +import ( + "fmt" + "syscall" + "testing" +) + +type testStruct struct { + Name string +} + +func TestSendErrorFromChild(t *testing.T) { + pipe, err := NewSyncPipe() + if err != nil { + t.Fatal(err) + } + defer func() { + if err := pipe.Close(); err != nil { + t.Fatal(err) + } + }() + + childfd, err := syscall.Dup(int(pipe.Child().Fd())) + if err != nil { + t.Fatal(err) + } + childPipe, _ := NewSyncPipeFromFd(0, uintptr(childfd)) + + pipe.CloseChild() + pipe.SendToChild(nil) + + expected := "something bad happened" + childPipe.ReportChildError(fmt.Errorf(expected)) + + childError := pipe.ReadFromChild() + if childError == nil { + t.Fatal("expected an error to be returned but did not receive anything") + } + + if childError.Error() != expected { + t.Fatalf("expected %q but received error message %q", expected, childError.Error()) + } +} + +func TestSendPayloadToChild(t *testing.T) { + pipe, err := NewSyncPipe() + if err != nil { + t.Fatal(err) + } + + defer func() { + if err := pipe.Close(); err != nil { + t.Fatal(err) + } + }() + + expected := "libcontainer" + + if err := pipe.SendToChild(testStruct{Name: expected}); err != nil { + t.Fatal(err) + } + + var s *testStruct + if err := pipe.ReadFromParent(&s); err != nil { + t.Fatal(err) + } + + if s.Name != expected { + t.Fatalf("expected name %q but received %q", expected, s.Name) + } +} diff --git a/system/linux.go b/system/linux.go new file mode 100644 index 00000000..c07ef153 --- /dev/null +++ b/system/linux.go @@ -0,0 +1,60 @@ +// +build linux + +package system + +import ( + "os/exec" + "syscall" + "unsafe" +) + +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + + return syscall.Exec(name, args, env) +} + +func ParentDeathSignal(sig uintptr) error { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 { + return err + } + return nil +} + +func GetParentDeathSignal() (int, error) { + var sig int + + _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0) + + if err != 0 { + return -1, err + } + + return sig, nil +} + +func SetKeepCaps() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_KEEPCAPS, 1, 0); err != 0 { + return err + } + + return nil +} + +func ClearKeepCaps() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_KEEPCAPS, 0, 0); err != 0 { + return err + } + + return nil +} + +func Setctty() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { + return err + } + return nil +} diff --git a/system/proc.go b/system/proc.go new file mode 100644 index 00000000..37808a29 --- /dev/null +++ b/system/proc.go @@ -0,0 +1,27 @@ +package system + +import ( + "io/ioutil" + "path/filepath" + "strconv" + "strings" +) + +// look in /proc to find the process start time so that we can verify +// that this pid has started after ourself +func GetProcessStartTime(pid int) (string, error) { + data, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + if err != nil { + return "", err + } + + parts := strings.Split(string(data), " ") + // the starttime is located at pos 22 + // from the man page + // + // starttime %llu (was %lu before Linux 2.6) + // (22) The time the process started after system boot. In kernels before Linux 2.6, this + // value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks + // (divide by sysconf(_SC_CLK_TCK)). + return parts[22-1], nil // starts at 1 +} diff --git a/system/setns_linux.go b/system/setns_linux.go new file mode 100644 index 00000000..32821ee2 --- /dev/null +++ b/system/setns_linux.go @@ -0,0 +1,31 @@ +package system + +import ( + "fmt" + "runtime" + "syscall" +) + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +// +// We need different setns values for the different platforms and arch +// We are declaring the macro here because the SETNS syscall does not exist in th stdlib +var setNsMap = map[string]uintptr{ + "linux/386": 346, + "linux/amd64": 308, + "linux/arm": 374, +} + +func Setns(fd uintptr, flags uintptr) error { + ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + if !exists { + return fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) + } + + _, _, err := syscall.RawSyscall(ns, fd, flags, 0) + if err != 0 { + return err + } + + return nil +} diff --git a/system/sysconfig.go b/system/sysconfig.go new file mode 100644 index 00000000..5efddefa --- /dev/null +++ b/system/sysconfig.go @@ -0,0 +1,12 @@ +// +build cgo + +package system + +/* +#include +*/ +import "C" + +func GetClockTicks() int { + return int(C.sysconf(C._SC_CLK_TCK)) +} diff --git a/system/sysconfig_notcgo.go b/system/sysconfig_notcgo.go new file mode 100644 index 00000000..663db82b --- /dev/null +++ b/system/sysconfig_notcgo.go @@ -0,0 +1,8 @@ +// +build !cgo + +package system + +func GetClockTicks() int { + // TODO figure out a better alternative for platforms where we're missing cgo + return 100 +} diff --git a/system/xattrs_linux.go b/system/xattrs_linux.go new file mode 100644 index 00000000..30f74dfb --- /dev/null +++ b/system/xattrs_linux.go @@ -0,0 +1,99 @@ +package system + +import ( + "syscall" + "unsafe" +) + +var _zero uintptr + +// Returns the size of xattrs and nil error +// Requires path, takes allocated []byte or nil as last argument +func Llistxattr(path string, dest []byte) (size int, err error) { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return -1, err + } + var newpathBytes unsafe.Pointer + if len(dest) > 0 { + newpathBytes = unsafe.Pointer(&dest[0]) + } else { + newpathBytes = unsafe.Pointer(&_zero) + } + + _size, _, errno := syscall.Syscall6(syscall.SYS_LLISTXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(newpathBytes), uintptr(len(dest)), 0, 0, 0) + size = int(_size) + if errno != 0 { + return -1, errno + } + + return size, nil +} + +// Returns a []byte slice if the xattr is set and nil otherwise +// Requires path and its attribute as arguments +func Lgetxattr(path string, attr string) ([]byte, error) { + var sz int + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return nil, err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return nil, err + } + + // Start with a 128 length byte array + sz = 128 + dest := make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + _sz, _, errno := syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + + switch { + case errno == syscall.ENODATA: + return nil, errno + case errno == syscall.ENOTSUP: + return nil, errno + case errno == syscall.ERANGE: + // 128 byte array might just not be good enough, + // A dummy buffer is used ``uintptr(0)`` to get real size + // of the xattrs on disk + _sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(unsafe.Pointer(nil)), uintptr(0), 0, 0) + sz = int(_sz) + if sz < 0 { + return nil, errno + } + dest = make([]byte, sz) + destBytes := unsafe.Pointer(&dest[0]) + _sz, _, errno = syscall.Syscall6(syscall.SYS_LGETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(destBytes), uintptr(len(dest)), 0, 0) + if errno != 0 { + return nil, errno + } + case errno != 0: + return nil, errno + } + sz = int(_sz) + return dest[:sz], nil +} + +func Lsetxattr(path string, attr string, data []byte, flags int) error { + pathBytes, err := syscall.BytePtrFromString(path) + if err != nil { + return err + } + attrBytes, err := syscall.BytePtrFromString(attr) + if err != nil { + return err + } + var dataBytes unsafe.Pointer + if len(data) > 0 { + dataBytes = unsafe.Pointer(&data[0]) + } else { + dataBytes = unsafe.Pointer(&_zero) + } + _, _, errno := syscall.Syscall6(syscall.SYS_LSETXATTR, uintptr(unsafe.Pointer(pathBytes)), uintptr(unsafe.Pointer(attrBytes)), uintptr(dataBytes), uintptr(len(data)), uintptr(flags), 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/types.go b/types.go new file mode 100644 index 00000000..c341137e --- /dev/null +++ b/types.go @@ -0,0 +1,11 @@ +package libcontainer + +import ( + "github.com/docker/libcontainer/cgroups" + "github.com/docker/libcontainer/network" +) + +type ContainerStats struct { + NetworkStats *network.NetworkStats `json:"network_stats,omitempty"` + CgroupStats *cgroups.Stats `json:"cgroup_stats,omitempty"` +} diff --git a/update-vendor.sh b/update-vendor.sh new file mode 100755 index 00000000..df66a0a8 --- /dev/null +++ b/update-vendor.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -e + +cd "$(dirname "$BASH_SOURCE")" + +# Downloads dependencies into vendor/ directory +mkdir -p vendor +cd vendor + +clone() { + vcs=$1 + pkg=$2 + rev=$3 + + pkg_url=https://$pkg + target_dir=src/$pkg + + echo -n "$pkg @ $rev: " + + if [ -d $target_dir ]; then + echo -n 'rm old, ' + rm -fr $target_dir + fi + + echo -n 'clone, ' + case $vcs in + git) + git clone --quiet --no-checkout $pkg_url $target_dir + ( cd $target_dir && git reset --quiet --hard $rev ) + ;; + hg) + hg clone --quiet --updaterev $rev $pkg_url $target_dir + ;; + esac + + echo -n 'rm VCS, ' + ( cd $target_dir && rm -rf .{git,hg} ) + + echo done +} + +# the following lines are in sorted order, FYI +clone git github.com/codegangsta/cli 1.1.0 +clone git github.com/coreos/go-systemd v2 +clone git github.com/godbus/dbus v1 +clone git github.com/syndtr/gocapability 3c85049eae + +# intentionally not vendoring Docker itself... that'd be a circle :) diff --git a/user/MAINTAINERS b/user/MAINTAINERS new file mode 100644 index 00000000..18e05a30 --- /dev/null +++ b/user/MAINTAINERS @@ -0,0 +1 @@ +Tianon Gravi (@tianon) diff --git a/user/user.go b/user/user.go new file mode 100644 index 00000000..493dd86f --- /dev/null +++ b/user/user.go @@ -0,0 +1,258 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "strconv" + "strings" +) + +const ( + minId = 0 + maxId = 1<<31 - 1 //for 32-bit systems compatibility +) + +var ( + ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId) +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +func parseLine(line string, v ...interface{}) { + if line == "" { + return + } + + parts := strings.Split(line, ":") + for i, p := range parts { + if len(v) <= i { + // if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files + break + } + + switch e := v[i].(type) { + case *string: + // "root", "adm", "/bin/bash" + *e = p + case *int: + // "0", "4", "1000" + // ignore string to int conversion errors, for great "tolerance" of naughty configuration files + *e, _ = strconv.Atoi(p) + case *[]string: + // "", "root", "root,adm,daemon" + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: + // panic, because this is a programming/logic error, not a runtime one + panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!") + } + } +} + +func ParsePasswd() ([]*User, error) { + return ParsePasswdFilter(nil) +} + +func ParsePasswdFilter(filter func(*User) bool) ([]*User, error) { + f, err := os.Open("/etc/passwd") + if err != nil { + return nil, err + } + defer f.Close() + return parsePasswdFile(f, filter) +} + +func parsePasswdFile(r io.Reader, filter func(*User) bool) ([]*User, error) { + var ( + s = bufio.NewScanner(r) + out = []*User{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := strings.TrimSpace(s.Text()) + if text == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := &User{} + parseLine( + text, + &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseGroup() ([]*Group, error) { + return ParseGroupFilter(nil) +} + +func ParseGroupFilter(filter func(*Group) bool) ([]*Group, error) { + f, err := os.Open("/etc/group") + if err != nil { + return nil, err + } + defer f.Close() + return parseGroupFile(f, filter) +} + +func parseGroupFile(r io.Reader, filter func(*Group) bool) ([]*Group, error) { + var ( + s = bufio.NewScanner(r) + out = []*Group{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + if text == "" { + continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := &Group{} + parseLine( + text, + &p.Name, &p.Pass, &p.Gid, &p.List, + ) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +// Given a string like "user", "1000", "user:group", "1000:1000", returns the uid, gid, list of supplementary group IDs, and home directory, if available and/or applicable. +func GetUserGroupSupplementaryHome(userSpec string, defaultUid, defaultGid int, defaultHome string) (int, int, []int, string, error) { + var ( + uid = defaultUid + gid = defaultGid + suppGids = []int{} + home = defaultHome + + userArg, groupArg string + ) + + // allow for userArg to have either "user" syntax, or optionally "user:group" syntax + parseLine(userSpec, &userArg, &groupArg) + + users, err := ParsePasswdFilter(func(u *User) bool { + if userArg == "" { + return u.Uid == uid + } + return u.Name == userArg || strconv.Itoa(u.Uid) == userArg + }) + if err != nil && !os.IsNotExist(err) { + if userArg == "" { + userArg = strconv.Itoa(uid) + } + return 0, 0, nil, "", fmt.Errorf("Unable to find user %v: %v", userArg, err) + } + + haveUser := users != nil && len(users) > 0 + if haveUser { + // if we found any user entries that matched our filter, let's take the first one as "correct" + uid = users[0].Uid + gid = users[0].Gid + home = users[0].Home + } else if userArg != "" { + // we asked for a user but didn't find them... let's check to see if we wanted a numeric user + uid, err = strconv.Atoi(userArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, "", fmt.Errorf("Unable to find user %v", userArg) + } + if uid < minId || uid > maxId { + return 0, 0, nil, "", ErrRange + } + + // if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit + } + + if groupArg != "" || (haveUser && users[0].Name != "") { + groups, err := ParseGroupFilter(func(g *Group) bool { + if groupArg != "" { + return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg + } + for _, u := range g.List { + if u == users[0].Name { + return true + } + } + return false + }) + if err != nil && !os.IsNotExist(err) { + return 0, 0, nil, "", fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err) + } + + haveGroup := groups != nil && len(groups) > 0 + if groupArg != "" { + if haveGroup { + // if we found any group entries that matched our filter, let's take the first one as "correct" + gid = groups[0].Gid + } else { + // we asked for a group but didn't find id... let's check to see if we wanted a numeric group + gid, err = strconv.Atoi(groupArg) + if err != nil { + // not numeric - we have to bail + return 0, 0, nil, "", fmt.Errorf("Unable to find group %v", groupArg) + } + if gid < minId || gid > maxId { + return 0, 0, nil, "", ErrRange + } + + // if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit + } + } else if haveGroup { + suppGids = make([]int, len(groups)) + for i, group := range groups { + suppGids[i] = group.Gid + } + } + } + + return uid, gid, suppGids, home, nil +} diff --git a/user/user_test.go b/user/user_test.go new file mode 100644 index 00000000..136632c2 --- /dev/null +++ b/user/user_test.go @@ -0,0 +1,94 @@ +package user + +import ( + "strings" + "testing" +) + +func TestUserParseLine(t *testing.T) { + var ( + a, b string + c []string + d int + ) + + parseLine("", &a, &b) + if a != "" || b != "" { + t.Fatalf("a and b should be empty ('%v', '%v')", a, b) + } + + parseLine("a", &a, &b) + if a != "a" || b != "" { + t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) + } + + parseLine("bad boys:corny cows", &a, &b) + if a != "bad boys" || b != "corny cows" { + t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) + } + + parseLine("", &c) + if len(c) != 0 { + t.Fatalf("c should be empty (%#v)", c) + } + + parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { + t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("::::::::::", &a, &b, &c) + if a != "" || b != "" || len(c) != 0 { + t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("not a number", &d) + if d != 0 { + t.Fatalf("d should be 0 (%v)", d) + } + + parseLine("b:12:c", &a, &d, &b) + if a != "b" || b != "c" || d != 12 { + t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) + } +} + +func TestUserParsePasswd(t *testing.T) { + users, err := parsePasswdFile(strings.NewReader(` +root:x:0:0:root:/root:/bin/bash +adm:x:3:4:adm:/var/adm:/bin/false +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(users) != 3 { + t.Fatalf("Expected 3 users, got %v", len(users)) + } + if users[0].Uid != 0 || users[0].Name != "root" { + t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name) + } + if users[1].Uid != 3 || users[1].Name != "adm" { + t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name) + } +} + +func TestUserParseGroup(t *testing.T) { + groups, err := parseGroupFile(strings.NewReader(` +root:x:0:root +adm:x:4:root,adm,daemon +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(groups) != 3 { + t.Fatalf("Expected 3 groups, got %v", len(groups)) + } + if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { + t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) + } + if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 { + t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List)) + } +} diff --git a/utils/utils.go b/utils/utils.go new file mode 100644 index 00000000..76184ce0 --- /dev/null +++ b/utils/utils.go @@ -0,0 +1,55 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "io/ioutil" + "path/filepath" + "strconv" + "syscall" +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +func CloseExecFrom(minFd int) error { + fdList, err := ioutil.ReadDir("/proc/self/fd") + if err != nil { + return err + } + for _, fi := range fdList { + fd, err := strconv.Atoi(fi.Name()) + if err != nil { + // ignore non-numeric file names + continue + } + + if fd < minFd { + // ignore descriptors lower than our specified minimum + continue + } + + // intentionally ignore errors from syscall.CloseOnExec + syscall.CloseOnExec(fd) + // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) + } + return nil +} diff --git a/xattr/xattr.go b/xattr/xattr.go new file mode 100644 index 00000000..fc08d01f --- /dev/null +++ b/xattr/xattr.go @@ -0,0 +1,53 @@ +// +build linux + +package xattr + +import ( + "syscall" + + "github.com/docker/libcontainer/system" +) + +func XattrEnabled(path string) bool { + if Setxattr(path, "user.test", "") == syscall.ENOTSUP { + return false + } + return true +} + +func stringsfromByte(buf []byte) (result []string) { + offset := 0 + for index, b := range buf { + if b == 0 { + result = append(result, string(buf[offset:index])) + offset = index + 1 + } + } + return +} + +func Listxattr(path string) ([]string, error) { + size, err := system.Llistxattr(path, nil) + if err != nil { + return nil, err + } + buf := make([]byte, size) + read, err := system.Llistxattr(path, buf) + if err != nil { + return nil, err + } + names := stringsfromByte(buf[:read]) + return names, nil +} + +func Getxattr(path, attr string) (string, error) { + value, err := system.Lgetxattr(path, attr) + if err != nil { + return "", err + } + return string(value), nil +} + +func Setxattr(path, xattr, value string) error { + return system.Lsetxattr(path, xattr, []byte(value), 0) +} diff --git a/xattr/xattr_test.go b/xattr/xattr_test.go new file mode 100644 index 00000000..d818c691 --- /dev/null +++ b/xattr/xattr_test.go @@ -0,0 +1,77 @@ +// +build linux + +package xattr_test + +import ( + "os" + "testing" + + "github.com/docker/libcontainer/xattr" +) + +func testXattr(t *testing.T) { + tmp := "xattr_test" + out, err := os.OpenFile(tmp, os.O_WRONLY, 0) + if err != nil { + t.Fatal("failed") + } + attr := "user.test" + out.Close() + + if !xattr.XattrEnabled(tmp) { + t.Log("Disabled") + t.Fatal("failed") + } + t.Log("Success") + + err = xattr.Setxattr(tmp, attr, "test") + if err != nil { + t.Fatal("failed") + } + + var value string + value, err = xattr.Getxattr(tmp, attr) + if err != nil { + t.Fatal("failed") + } + if value != "test" { + t.Fatal("failed") + } + t.Log("Success") + + var names []string + names, err = xattr.Listxattr(tmp) + if err != nil { + t.Fatal("failed") + } + + var found int + for _, name := range names { + if name == attr { + found = 1 + } + } + // Listxattr doesn't return trusted.* and system.* namespace + // attrs when run in unprevileged mode. + if found != 1 { + t.Fatal("failed") + } + t.Log("Success") + + big := "0000000000000000000000000000000000000000000000000000000000000000000008c6419ad822dfe29283fb3ac98dcc5908810cb31f4cfe690040c42c144b7492eicompslf20dxmlpgz" + // Test for long xattrs larger than 128 bytes + err = xattr.Setxattr(tmp, attr, big) + if err != nil { + t.Fatal("failed to add long value") + } + value, err = xattr.Getxattr(tmp, attr) + if err != nil { + t.Fatal("failed to get long value") + } + t.Log("Success") + + if value != big { + t.Fatal("failed, value doesn't match") + } + t.Log("Success") +} -- 2.30.2