added git-annex-compute-singularity
authorJoey Hess <joeyh@joeyh.name>
Mon, 10 Mar 2025 20:41:26 +0000 (16:41 -0400)
committerJoey Hess <joeyh@joeyh.name>
Mon, 10 Mar 2025 20:41:26 +0000 (16:41 -0400)
And implemented SANDBOX, which it needs.

COPYRIGHT
Remote/Compute.hs
doc/design/compute_special_remote_interface.mdwn
doc/special_remotes/compute.mdwn
doc/special_remotes/compute/git-annex-compute-singularity [new file with mode: 0755]
doc/special_remotes/compute/git-annex-compute-singularity-examples.mdwn [new file with mode: 0644]
doc/todo/compute_special_remote_remaining_todos.mdwn

index 54a250abae5b60a973f6babcc8c8947faec6c913..3ca3debd09b134052cdeaee1a0cb8ef41537abc6 100644 (file)
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -14,7 +14,7 @@ Files: doc/special_remotes/external/*
 Copyright: © 2013 Joey Hess <id@joeyh.name>
 License: GPL-3+
 
-Files: doc/special_remotes/compute/git-annex-compute-imageconvert doc/special_remotes/compute/git-annex-compute-wasmedge
+Files: doc/special_remotes/compute/git-annex-compute-imageconvert doc/special_remotes/compute/git-annex-compute-wasmedge doc/special_remotes/compute/git-annex-compute-singularity
 Copyright: © 2025 Joey Hess <id@joeyh.name>
 License: GPL-3+
 
index be8429435c6b58959ef1593883ecd75e1382b476..7d21ddccdbce6f8f53ad0b5922f358cd90035128 100644 (file)
@@ -52,6 +52,7 @@ import Utility.Env
 import Utility.Tmp.Dir
 import Utility.Url
 import Utility.MonotonicClock
+import Utility.CopyFile
 import Types.Key
 import Backend
 import qualified Git
@@ -201,6 +202,7 @@ data ProcessCommand
        = ProcessInput FilePath
        | ProcessOutput FilePath
        | ProcessReproducible
+       | ProcessSandbox
        | ProcessProgress PercentFloat
        deriving (Show, Eq)
 
@@ -208,6 +210,7 @@ instance Proto.Receivable ProcessCommand where
        parseCommand "INPUT" = Proto.parse1 ProcessInput
        parseCommand "OUTPUT" = Proto.parse1 ProcessOutput
        parseCommand "REPRODUCIBLE" = Proto.parse0 ProcessReproducible
+       parseCommand "SANDBOX" = Proto.parse0 ProcessSandbox
        parseCommand "PROGRESS" = Proto.parse1 ProcessProgress
        parseCommand _ = Proto.parseFail
 
@@ -382,6 +385,7 @@ data ComputeProgramResult = ComputeProgramResult
        { computeState :: ComputeState
        , computeInputsUnavailable :: Bool
        , computeReproducible :: Bool
+       , computeSandbox :: Bool
        }
 
 runComputeProgram
@@ -410,7 +414,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
                         }
                showOutput
                starttime <- liftIO currentMonotonicTimestamp
-               let startresult = ComputeProgramResult state False False
+               let startresult = ComputeProgramResult state False False False
                result <- withmeterfile $ \meterfile -> bracket
                        (liftIO $ createProcess pr)
                        (liftIO . cleanupProcess)
@@ -457,13 +461,17 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
                        checksafefile tmpdir subdir f' "input"
                        checkimmutable knowninput "inputting" f' $ do
                                (k, inputcontent) <- getinputcontent f'
+                               let mkrel a = Just <$> 
+                                       (a >>= liftIO . relPathDirToFile subdir)
                                mp <- case inputcontent of
                                        Nothing -> pure Nothing
-                                       Just (Right f'') -> liftIO $
-                                               Just <$> relPathDirToFile subdir f''
-                                       Just (Left gitsha) ->
-                                               Just <$> (liftIO . relPathDirToFile subdir 
-                                                       =<< populategitsha gitsha tmpdir)
+                                       Just (Right obj)
+                                               | computeSandbox result -> 
+                                                       mkrel $ populatesandbox obj tmpdir
+                                               | otherwise ->
+                                                       mkrel $ pure obj
+                                       Just (Left gitsha) -> 
+                                               mkrel $ populategitsha gitsha tmpdir
                                sendresponse p $
                                        maybe "" fromOsPath mp
                                let result' = result
@@ -506,6 +514,14 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
                        return result
                Just ProcessReproducible ->
                        return $ result { computeReproducible = True }
+               Just ProcessSandbox -> do
+                       sandboxpath <- liftIO $ fromOsPath <$>
+                               relPathDirToFile subdir tmpdir
+                       sendresponse p $
+                               if null sandboxpath
+                                       then "."
+                                       else sandboxpath
+                       return $ result { computeSandbox = True }
                Nothing -> giveup $
                        program ++ " output an unparseable line: \"" ++ l ++ "\""
 
@@ -546,12 +562,23 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
        -- to the program as a parameter, which could parse it as a dashed
        -- option or other special parameter.
        populategitsha gitsha tmpdir = do
-               let f = tmpdir </> literalOsPath ".git" </> literalOsPath "objects"
+               let f = tmpdir </> literalOsPath ".git"
+                       </> literalOsPath "objects"
                        </> toOsPath (Git.fromRef' gitsha)
                liftIO $ createDirectoryIfMissing True $ takeDirectory f
                liftIO . F.writeFile f =<< catObject gitsha
                return f
 
+       populatesandbox annexobj tmpdir = do
+               let f = tmpdir </> literalOsPath ".git"
+                       </> literalOsPath "annex"
+                       </> literalOsPath "objects"
+                       </> takeFileName annexobj
+               liftIO $ createDirectoryIfMissing True $ takeDirectory f
+               liftIO $ unlessM (createLinkOrCopy annexobj f) $
+                       giveup "Unable to populate compute sandbox directory"
+               return f
+
        withmeterfile a = case meterkey of
                Nothing -> a (const noop)
                Just (_, progress) -> do
index 0ab7c45df440f10546db49d8a8f22fcc68e85770..52b676c04e5175053ba01d528fad3b251f078306 100644 (file)
@@ -88,6 +88,14 @@ indicates that the results of its computations are expected to be
 bit-for-bit reproducible. That makes `git-annex addcomputed` behave as if
 the `--reproducible` option is set.
 
+The program can also output a "SANDBOX" line, and then read a line from
+stdin that will be the path to the directory it should sandbox to (which
+corresponds to the top of the git repository, so may be above its working
+directory). Any "INPUT" lines that come after "SANDBOX" will have input
+files be provided via paths that are inside the sandbox directory. Usually
+that is done by making hard links, but it will fall back to copying annexed
+files if the filesystem does not support hard links.
+
 Anything that the program outputs to stderr will be displayed to the user.
 This stderr should be used for error messages, and possibly computation
 output, but not for progress displays.
index 33b1253978cb02e14a7b5e15d597c0c95c79084b..52d650068f038260e665253f0630685eddbd4243 100644 (file)
@@ -39,6 +39,13 @@ List it here with an example!
 
   `git-annex addcomputed --to=imageconvert foo.jpeg foo.gif`
 
+* [[compute/git-annex-compute-singularity]]
+  Uses [Singularity](https://sylabs.io/) to run a container, which is
+  checked into the git-annex repository, to compute other files in the
+  repository. Among other things, this can run other compute programs
+  inside a singularity container. 
+  [[Examples here|compute/git-annex-compute-singularity-examples]]
+
 * [[compute/git-annex-compute-wasmedge]]  
   Uses [WasmEdge](https://WasmEdge.org/) to run WASM programs that are
   checked into the git-annex repository, to compute other files in the
diff --git a/doc/special_remotes/compute/git-annex-compute-singularity b/doc/special_remotes/compute/git-annex-compute-singularity
new file mode 100755 (executable)
index 0000000..d296e01
--- /dev/null
@@ -0,0 +1,94 @@
+#!/bin/bash
+# git-annex compute remote program that runs singularity containers
+# from the git-annex repository.
+# 
+# Copyright 2025 Joey Hess; licenced under the GNU GPL version 3 or higher.
+set -e
+
+# At least one parameter (the container, or the first option) is required.
+[ -n "$1" ] || {
+       echo "Usage: container [singularity options] [inputs] -- [outputs] -- [command params]" >&2
+       exit 1
+}
+
+# Defaults, adjusted later while parameters are processed. Both directories
+# start out as the directory the program was started in.
+container=""
+nocompat_opt=""
+fakeroot_opt=""
+rundir="$(pwd)"
+binddir="$(pwd)"
+
+# Run the container image in $container, passing all parameters of this
+# function through to it.
+#
+# --net --network=none disables network access, so that an untrusted
+# singularity image cannot phone home and/or attack the local network.
+# --oci is used to get process namespacing.
+#
+# $nocompat_opt and $fakeroot_opt are deliberately left unquoted, so that
+# when they are empty no parameter at all is passed for them.
+run_singularity () {
+       singularity run \
+               --net \
+               --network=none \
+               --oci \
+               --bind="$binddir" \
+               --pwd="$rundir" \
+               $nocompat_opt $fakeroot_opt \
+               "$container" \
+               "$@"
+}
+
+# Avoid any security problems with harmful terminal escape sequences,
+# by filtering every ESC (0x1B) byte out of stdin on its way to stdout.
+#
+# tr is used rather than sed with a [\x1B] bracket expression, because
+# \x escapes inside a bracket expression are a GNU extension; a POSIX sed
+# treats that expression as the literal characters \, x, 1 and B.
+strip_escape () {
+       tr -d '\033'
+}
+
+if [ -z "$ANNEX_COMPUTE_passthrough" ]; then
+       # Arbitrary command mode. The parameters are split by "--" into
+       # three stages: 1 is singularity options and input files, 2 is
+       # output files, and 3 is the command to run inside the container.
+       stage=1
+       while [ -n "$1" ]; do
+               if [ "$1" = "--" ]; then
+                       stage=$((stage+1))
+                       shift 1
+               else
+                       if [ "$stage" = 1 ]; then
+                               case "$1" in
+                                       "--no-compat")
+                                               nocompat_opt="--no-compat"
+                                               ;;
+                                       "--fakeroot")
+                                               fakeroot_opt="--fakeroot"
+                                               ;;
+                                       *)
+                                               # Ask git-annex for the input file. The
+                                               # reply is the path to its content, or an
+                                               # empty line when it is not available.
+                                               # IFS= and -r keep the path exactly as
+                                               # sent, including any backslashes and
+                                               # leading or trailing whitespace.
+                                               echo "INPUT $1"
+                                               IFS= read -r input
+                                               if [ -n "$input" ]; then
+                                                       # Hard link the content into place
+                                                       # under the name that was requested.
+                                                       p="./$1"
+                                                       mkdir -p "$(dirname "$p")"
+                                                       ln "$(realpath "$input")" "$p"
+                                                       # The first input file is the
+                                                       # container image to run.
+                                                       if [ -z "$container" ]; then
+                                                               container="$p"
+                                                       fi
+                                               fi
+                                               ;;
+                               esac
+                               shift 1
+                       elif [ "$stage" = 2 ]; then
+                               # Register the output file. The reply is read
+                               # to keep the protocol in sync, but is not used.
+                               echo "OUTPUT $1"
+                               IFS= read -r output
+                               shift 1
+                       else
+                               # What remains in "$@" is the command to run.
+                               break
+                       fi
+               fi
+       done
+       # Without pipefail, the exit status of the pipeline below would be
+       # that of strip_escape, and a failed computation would be reported
+       # as a success.
+       set -o pipefail
+       run_singularity "$@" </dev/null 2>&1 | strip_escape >&2
+else
+       # Tell git-annex that the program will be running sandboxed,
+       # it will tell us where the top of the sandbox is, and that's the
+       # directory to bind into singularity.
+       echo "SANDBOX"
+       IFS= read -r pathtotop
+       binddir="$(realpath "$pathtotop")"
+       # The container to pass through to is relative to the top of the
+       # sandbox. An empty reply means its content is not available.
+       echo "INPUT $pathtotop/$ANNEX_COMPUTE_passthrough"
+       IFS= read -r input
+       if [ -n "$input" ]; then
+               container="./$ANNEX_COMPUTE_passthrough"
+               mkdir -p "$(dirname "$container")"
+               ln "$(realpath "$input")" "$container"
+       else
+               echo "Unfortunately, addcomputed --fast cannot be used with git-annex-compute-singularity --passthrough=" >&2
+               exit 1
+       fi
+       # stdio is passed through to the git-annex-compute- command inside
+       # singularity
+       run_singularity "$@" 2> >( strip_escape 1>&2 )
+fi
diff --git a/doc/special_remotes/compute/git-annex-compute-singularity-examples.mdwn b/doc/special_remotes/compute/git-annex-compute-singularity-examples.mdwn
new file mode 100644 (file)
index 0000000..7613667
--- /dev/null
@@ -0,0 +1,70 @@
+[[git-annex-compute-singularity]] uses [Singularity](https://sylabs.io/)
+to run a container, which is checked into the git-annex repository,
+to [[compute]] other files in the repository.
+
+This can be used in two different ways. One is to run an arbitrary command
+inside the singularity container. That is very flexible, but the syntax is
+slightly awkward since you have to provide the input and output filenames,
+as well as the command. The other way to use it is to have a singularity
+container that contains and runs another `git-annex-compute-` command.
+
+## running an arbitrary command
+
+An example of running an arbitrary command is:
+
+       git-annex initremote singularity type=compute program=git-annex-compute-singularity
+    singularity build debian.sif docker://debian
+    git-annex add debian.sif
+    git-annex addcomputed --to=singularity -- debian.sif foo bar -- baz -- sh -c 'cat foo bar > baz'
+
+Here the first filename passed to `git-annex addcomputed` must be the
+singularity container image to use. It is followed by the input files to
+make available inside the container, followed by "--" and then the output
+files. Finally, "--" separates the output files from the parameters
+to pass into the container.
+
+## passing through to a git-annex-compute- command inside a singularity container
+
+    git-annex initremote foo type=compute program=git-annex-compute-singularity passthrough=imageconvert.sif 
+    git-annex addcomputed --to=foo foo.jpeg foo.gif
+
+This example uses a container `imageconvert.sif` that runs
+[[git-annex-compute-imageconvert]]. This allows using `git-annex addcomputed`
+with the same syntax that compute program usually uses.
+
+Note that the container file given to `passthrough=` is relative to the top
+of the git repository.
+
+To create that `imageconvert.sif` container:
+
+    cat > imageconvert.def <<EOF
+    Bootstrap: docker
+    From: debian
+    
+    %post
+        apt-get -y update
+        apt-get -y install imagemagick wget
+        wget https://git-annex.branchable.com/special_remotes/compute/git-annex-compute-imageconvert -O /go
+        chmod +x /go
+    
+    %runscript
+        /go "$@"
+    EOF
+    sudo singularity build imageconvert.sif imageconvert.def
+
+## singularity options
+
+`singularity run` is used to start the default command in the container.
+The command will find the input files in its current directory, and can
+write the output files to the same directory.
+
+Singularity is run with the `--oci` option, to get process namespacing
+and a generally secure sandboxed environment. Network access is also
+disabled in the container.
+
+A few singularity options can be provided, to control how the container is
+run. The goal is to only allow options that keep it secure. See singularity's
+documentation for details about these options.
+
+* `--no-compat`
+* `--fakeroot`
index bb522398a40435b15cfc6188ba7ecba365608477..f8917aded8182912cb3adba708538363e0c06753 100644 (file)
@@ -1,6 +1,11 @@
 This is the remainder of my todo list while I was building the
 compute special remote. --[[Joey]]
 
+* git-annex-compute-singularity with passthrough= cannot be used
+  by `git-annex addcomputed --fast` because the singularity image is not
+  available to run. Maybe make a variety of INPUT that is provided also
+  in --fast mode to solve this?
+
 * write a tip showing how to use this
 
 * Write some simple compute programs so we have something to start with.