{- Stall detection for transfers.
-
- - Copyright 2020-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2020-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
import Control.Concurrent.STM
import Control.Monad.IO.Class (MonadIO)
+import Data.Time.Clock
{- This may be safely canceled (with eg uninterruptibleCancel),
- as long as the passed action can be safely canceled. -}
detectStalls :: (Monad m, MonadIO m) => Maybe StallDetection -> TVar (Maybe BytesProcessed) -> m () -> m ()
detectStalls Nothing _ _ = noop
detectStalls (Just StallDetectionDisabled) _ _ = noop
-detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall =
- detectStalls' minsz duration metervar onstall Nothing
+detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall = do
+ -- If the progress is being updated, but less frequently than
+ -- the specified duration, a stall would be incorrectly detected.
+ --
+ -- For example, consider the case of a remote that does
+ -- not support progress updates, but is chunked with a large chunk
+ -- size. In that case, progress is only updated after each chunk.
+ --
+ -- So, wait for the first update, and see how long it takes.
+	-- If it's longer than the duration, upscale the duration and minsz
+ -- accordingly.
+ starttime <- liftIO getCurrentTime
+ v <- waitforfirstupdate =<< readMeterVar metervar
+ endtime <- liftIO getCurrentTime
+ let timepassed = floor (endtime `diffUTCTime` starttime)
+ let (scaledminsz, scaledduration) = upscale timepassed
+ detectStalls' scaledminsz scaledduration metervar onstall v
+ where
+ upscale timepassed
+ | timepassed > dsecs =
+ let scale = scaleamount timepassed
+ in (minsz * scale, Duration (dsecs * scale))
+ | otherwise = (minsz, duration)
+ scaleamount timepassed = max 1 $ floor $
+ (fromIntegral timepassed / fromIntegral (max dsecs 1))
+ * scalefudgefactor
+ scalefudgefactor = 1.5 :: Double
+ dsecs = durationSeconds duration
+ minwaitsecs = Seconds $
+ min 60 (fromIntegral (durationSeconds duration))
+ waitforfirstupdate startval = do
+ liftIO $ threadDelaySeconds minwaitsecs
+ v <- readMeterVar metervar
+ if v > startval
+ then return v
+ else waitforfirstupdate startval
+
detectStalls (Just ProbeStallDetection) metervar onstall = do
-- Only do stall detection once the progress is confirmed to be
-- consistently updating. After the first update, it needs to
-- advance twice within 30 seconds. With that established,
-- if no data at all is sent for a 60 second period, it's
-- assumed to be a stall.
- v <- getval >>= waitforfirstupdate
+ v <- readMeterVar metervar >>= waitforfirstupdate
ontimelyadvance v $ \v' -> ontimelyadvance v' $
detectStalls' 1 duration metervar onstall
where
- getval = liftIO $ atomically $ fmap fromBytesProcessed
- <$> readTVar metervar
-
duration = Duration 60
delay = Seconds (fromIntegral (durationSeconds duration) `div` 2)
waitforfirstupdate startval = do
liftIO $ threadDelaySeconds delay
- v <- getval
+ v <- readMeterVar metervar
if v > startval
then return v
else waitforfirstupdate startval
ontimelyadvance v cont = do
liftIO $ threadDelaySeconds delay
- v' <- getval
+ v' <- readMeterVar metervar
when (v' > v) $
cont v'
detectStalls' minsz duration metervar onstall st = do
liftIO $ threadDelaySeconds delay
-- Get whatever progress value was reported most recently, if any.
- v <- liftIO $ atomically $ fmap fromBytesProcessed
- <$> readTVar metervar
+ v <- readMeterVar metervar
let cont = detectStalls' minsz duration metervar onstall v
case (st, v) of
(Nothing, _) -> cont
| otherwise -> cont
where
delay = Seconds (fromIntegral (durationSeconds duration))
+
+readMeterVar
+ :: MonadIO m
+ => TVar (Maybe BytesProcessed)
+ -> m (Maybe ByteSize)
+readMeterVar metervar = liftIO $ atomically $
+ fmap fromBytesProcessed <$> readTVar metervar
* Improve disk free space checking when transferring unsized keys to
local git remotes.
* import: --message/-m option.
+ * Improve annex.stalldetection to handle remotes that update progress
+ less frequently than the configured time period.
-- Joey Hess <id@joeyh.name> Fri, 29 Dec 2023 11:52:06 -0400
-- stored, update the chunk log.
chunksStored u k (FixedSizeChunks chunksize) numchunks
| otherwise = do
- liftIO $ meterupdate' zeroBytesProcessed
let (chunkkey, chunkkeys') = nextChunkKeyStream chunkkeys
storechunk chunkkey (ByteContent chunk) meterupdate'
+ liftIO $ meterupdate' zeroBytesProcessed
let bytesprocessed' = addBytesProcessed bytesprocessed (L.length chunk)
loop bytesprocessed' (splitchunk bs) chunkkeys'
where
--- /dev/null
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2024-01-18T17:18:07Z"
+ content="""
+It looks like you must have annex.stalldetection (or the per-remote config)
+set. git-annex does not behave this way without that configuration.
+What is it set to?
+
+You are probably right in that it involves rclone special remote not
+reporting transfer progress back to git-annex.
+
+Normally, when a special remote does not do progress reporting,
+git-annex does not do any stall detection, because there must have been
+at least some previous progress update in order for it to detect a stall.
+
+But when chunking is enabled (as it was in your case with 1 gb chunks),
+git-annex itself updates the progress after each chunk. When the special
+remote does not do any progress reporting, and chunk size is large, that
+means that the progress will be updated very infrequently.
+
+So for example, if it takes 2 minutes to upload a chunk, and you had
+annex.stalldetection set to eg "10mb/1m", then in a chunk after the 1st one,
+git-annex would wake up after 1 minute, see that no data seemed to have
+been sent, and conclude there was a stall. You would need to change the
+time period in the config to something less granular eg "100mb/10m"
+to avoid that.
+
+This might be a documentation problem; it may not be clear to the user
+that "10mb/1m" is any different than "100mb/10m". And finding a value that
+works depends on knowing details of how often the progress gets updated
+for a given remote.
+
+But, your transcript shows that the stall was detected on chunk 296.
+(git-annex's chunk, rclone is doing its own chunking to dropbox)
+So the annex.stalldetection configuration must have been one that
+worked most of the time, for it to transfer the previous 295 chunks
+without a stall having been detected. Unless this was a resume after
+previous command(s) had uploaded those other chunks.
+"""]]
--- /dev/null
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 2"""
+ date="2024-01-18T17:50:21Z"
+ content="""
+I think that what git-annex could do is detect when progress updates are
+happening with too low a granularity for the annex.stalldetection
+configuration.
+
+When waiting for the first progress update, it can keep track of how much time
+has elapsed. If annex.stalldetection is "10mb/2m" and it took 20 minutes to
+get the first progress update, the granularity is clearly too low.
+
+And then it could disregard the configuration, or suggest a better
+configuration value, or adjust what it's expecting to match the
+observed granularity.
+
+(The stall detection auto-prober uses a similar heuristic to that already.
+It observes the granularity and only if it's sufficiently low (an update
+every 30 seconds or less) does it assume that 60 seconds without an update
+may be a stall.)
+"""]]
--- /dev/null
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 3"""
+ date="2024-01-18T19:04:26Z"
+ content="""
+To reproduce this (more or less), I modified the example.sh external
+special remote to sleep for 10 seconds before each key store.
+Set up a remote with chunk=1mb, and annex.stalldetection = "0.001mb/1s".
+
+Uploading a 100 mb file, a stall is detected after the first chunk is
+uploaded. As expected, since 1 second passed with no update.
+
+When I resume the upload, the second chunk is uploaded and then a stall is
+detected on the third. And so on.
+
+I've implemented dynamic granularity scaling now, and in this test case, it notices
+it took 11 seconds for the first chunk, and behaves as if it were
+configured with annex.stalldetection of "0.022mb/22s". Which keeps it from
+detecting a stall.
+"""]]
Set to "false" to avoid all attempts at stall detection.
- For more fine control over what constitutes a stall, set to a value in
- the form "$amount/$timeperiod" to specify how much data git-annex should
+ To detect and cancel stalled transfers, set this to a value in the form
+ "$amount/$timeperiod" which specifies how much data git-annex should
expect to see flowing, minimum, over a given period of time.
For example, to detect outright stalls where no data has been transferred
stuck for a long time, you could use:
`git config remote.usbdrive.annex-stalldetection "1MB/1m"`
- This is not enabled by default, because it can make git-annex use
- more resources. To be able to cancel stalls, git-annex has to run
- transfers in separate processes (one per concurrent job). So it
- may need to open more connections to a remote than usual, or
- the communication with those processes may make it a bit slower.
+ Some remotes don't report transfer progress, and stalls cannot be
+ detected when using those.
+
+ Some remotes only report transfer progress occasionally, eg
+ after each chunk. To avoid false timeouts in such a situation, if the
+ first progress update takes longer to arrive than the configured time
+  period, the stall detection will be automatically adjusted to use a longer
+ time period. For example, if the first progress update comes after 10
+ minutes, but annex.stalldetection is "1MB/1m", it will be treated as eg
+ "15MB/15m".
+
+ Configuring stall detection can make git-annex use more resources. To be
+ able to cancel stalls, git-annex has to run transfers in separate
+ processes (one per concurrent job). So it may need to open more
+ connections to a remote than usual, or the communication with those
+ processes may make it a bit slower.
* `remote.<name>.annex-checkuuid`