automatically adjust stall detection period

author Joey Hess <joeyh@joeyh.name>

Thu, 18 Jan 2024 21:11:56 +0000 (17:11 -0400)

committer Joey Hess <joeyh@joeyh.name>

Thu, 18 Jan 2024 21:12:10 +0000 (17:12 -0400)
author Joey Hess <joeyh@joeyh.name>
Thu, 18 Jan 2024 21:11:56 +0000 (17:11 -0400)
committer Joey Hess <joeyh@joeyh.name>
Thu, 18 Jan 2024 21:12:10 +0000 (17:12 -0400)
diff --git a/Annex/StallDetection.hs b/Annex/StallDetection.hs

index 9c095b4c86b3ac96f36274b6be99729620ae3efa..1fa440cc22ad01335f4541eaf145605b550274ef 100644 (file)
--- a/Annex/StallDetection.hs
+++ b/Annex/StallDetection.hs
@@ -1,6 +1,6 @@
  {- Stall detection for transfers.
   -
- - Copyright 2020-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2020-2024 Joey Hess <id@joeyh.name>
   -
   - Licensed under the GNU AGPL version 3 or higher.
   -}
@@ -16,41 +16,74 @@ import Utility.ThreadScheduler
  
  import Control.Concurrent.STM
  import Control.Monad.IO.Class (MonadIO)
+import Data.Time.Clock
  
  {- This may be safely canceled (with eg uninterruptibleCancel),
   - as long as the passed action can be safely canceled. -}
  detectStalls :: (Monad m, MonadIO m) => Maybe StallDetection -> TVar (Maybe BytesProcessed) -> m () -> m ()
  detectStalls Nothing _ _ = noop
  detectStalls (Just StallDetectionDisabled) _ _ = noop
-detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall =
-       detectStalls' minsz duration metervar onstall Nothing
+detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall = do
+       -- If the progress is being updated, but less frequently than
+       -- the specified duration, a stall would be incorrectly detected.
+       --
+       -- For example, consider the case of a remote that does
+       -- not support progress updates, but is chunked with a large chunk
+       -- size. In that case, progress is only updated after each chunk.
+       --
+       -- So, wait for the first update, and see how long it takes.
+       -- It's longer than the duration, upscale the duration and minsz
+       -- accordingly.
+       starttime <- liftIO getCurrentTime
+       v <- waitforfirstupdate =<< readMeterVar metervar
+       endtime <- liftIO getCurrentTime
+       let timepassed = floor (endtime `diffUTCTime` starttime)
+       let (scaledminsz, scaledduration) = upscale timepassed
+       detectStalls' scaledminsz scaledduration metervar onstall v
+  where
+       upscale timepassed
+               | timepassed > dsecs = 
+                       let scale = scaleamount timepassed
+                       in (minsz * scale, Duration (dsecs * scale))
+               | otherwise = (minsz, duration)
+       scaleamount timepassed = max 1 $ floor $
+               (fromIntegral timepassed / fromIntegral (max dsecs 1))
+               * scalefudgefactor
+       scalefudgefactor = 1.5 :: Double
+       dsecs = durationSeconds duration
+       minwaitsecs = Seconds $
+               min 60 (fromIntegral (durationSeconds duration))
+       waitforfirstupdate startval = do
+               liftIO $ threadDelaySeconds minwaitsecs
+               v <- readMeterVar metervar
+               if v > startval
+                       then return v
+                       else waitforfirstupdate startval
+       
  detectStalls (Just ProbeStallDetection) metervar onstall = do
         -- Only do stall detection once the progress is confirmed to be
         -- consistently updating. After the first update, it needs to
         -- advance twice within 30 seconds. With that established,
         -- if no data at all is sent for a 60 second period, it's
         -- assumed to be a stall.
-       v <- getval >>= waitforfirstupdate
+       v <- readMeterVar metervar >>= waitforfirstupdate
         ontimelyadvance v $ \v' -> ontimelyadvance v' $
                 detectStalls' 1 duration metervar onstall
    where
-       getval = liftIO $ atomically $ fmap fromBytesProcessed
-               <$> readTVar metervar
-       
         duration = Duration 60
  
         delay = Seconds (fromIntegral (durationSeconds duration) `div` 2)
         
         waitforfirstupdate startval = do
                 liftIO $ threadDelaySeconds delay
-               v <- getval
+               v <- readMeterVar metervar
                 if v > startval
                         then return v
                         else waitforfirstupdate startval
  
         ontimelyadvance v cont = do
                 liftIO $ threadDelaySeconds delay
-               v' <- getval
+               v' <- readMeterVar metervar
                 when (v' > v) $
                         cont v'
  
@@ -65,8 +98,7 @@ detectStalls'
  detectStalls' minsz duration metervar onstall st = do
         liftIO $ threadDelaySeconds delay
         -- Get whatever progress value was reported most recently, if any.
-       v <- liftIO $ atomically $ fmap fromBytesProcessed
-               <$> readTVar metervar
+       v <- readMeterVar metervar
         let cont = detectStalls' minsz duration metervar onstall v
         case (st, v) of
                 (Nothing, _) -> cont
@@ -81,3 +113,10 @@ detectStalls' minsz duration metervar onstall st = do
                         | otherwise -> cont
    where
         delay = Seconds (fromIntegral (durationSeconds duration))
+
+readMeterVar
+       :: MonadIO m
+       => TVar (Maybe BytesProcessed)
+       -> m (Maybe ByteSize)
+readMeterVar metervar = liftIO $ atomically $ 
+       fmap fromBytesProcessed <$> readTVar metervar
diff --git a/CHANGELOG b/CHANGELOG

index 20278c8ba3265d5e6bfee671e1ff0d819d730016..8ab8386fb71c38d40500a25e6f736ffd63becd3e 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -13,6 +13,8 @@ git-annex (10.20231228) UNRELEASED; urgency=medium
    * Improve disk free space checking when transferring unsized keys to
      local git remotes.
    * import: --message/-m option.
+  * Improve annex.stalldetection to handle remotes that update progress
+    less frequently than the configured time period.
  
   -- Joey Hess <id@joeyh.name>  Fri, 29 Dec 2023 11:52:06 -0400
  
diff --git a/Remote/Helper/Chunked.hs b/Remote/Helper/Chunked.hs

index 29b97b761a5fce787f82f8a5cf97ed81e71de8bb..bb903b72662575c1e981b9ba10f65c0cb40d146a 100644 (file)
--- a/Remote/Helper/Chunked.hs
+++ b/Remote/Helper/Chunked.hs
@@ -159,9 +159,9 @@ storeChunks u chunkconfig encryptor k f p enc encc storer checker =
                                 -- stored, update the chunk log.
                                 chunksStored u k (FixedSizeChunks chunksize) numchunks
                         | otherwise = do
-                               liftIO $ meterupdate' zeroBytesProcessed
                                 let (chunkkey, chunkkeys') = nextChunkKeyStream chunkkeys
                                 storechunk chunkkey (ByteContent chunk) meterupdate'
+                               liftIO $ meterupdate' zeroBytesProcessed
                                 let bytesprocessed' = addBytesProcessed bytesprocessed (L.length chunk)
                                 loop bytesprocessed' (splitchunk bs) chunkkeys'
                   where
diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment

new file mode 100644 (file)

index 0000000..4c2ccc7
--- /dev/null
+++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment
@@ -0,0 +1,40 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2024-01-18T17:18:07Z"
+ content="""
+It looks like you must have annex.stalldetection (or the per-remote config)
+set. git-annex does not behave this way without that configuration.
+What is it set to?
+
+You are probably right in that it involves rclone special remote not
+reporting transfer progress back to git-annex.
+
+Normally, when a special remote does not do progress reporting,
+git-annex does not do any stall detection, because there must have been
+at least some previous progress update in order for it to detect a stall.
+
+But when chunking is enabled (as it was in your case with 1 gb chunks),
+git-annex itself updates the progress after each chunk. When the special
+remote does not do any progress reporting, and chunk size is large, that
+means that the progress will be updated very infrequently. 
+
+So for example, if it takes 2 minutes to upload a chunk, and you had
+annex.stalldetection set to eg "10mb/1m", then in a chunk after the 1st one,
+git-annex would wake up after 1 minute, see that no data seemed to have
+been sent, and conclude there was a stall. You would need to change the
+time period in the config to something less granular eg "100mb/10m"
+to avoid that.
+
+This might be a documentation problem, it may not be clear to the user
+that "10mb/1m" is any different than "100mb/10m". And finding a value that
+works depends on knowing details of how often the progress gets updated
+for a given remote.
+
+But, your transcript show that the stall was detected on chunk 296.
+(git-annex's chunk, rclone is doing its own chunking to dropbox)
+So the annex.stalldetection configuration must have been one that
+worked most of the time, for it to transfer the previous 295 chunks
+without a stall having been detected. Unless this was a resume after
+previous command(s) had uploaded those other chunks.
+"""]]
diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment

new file mode 100644 (file)

index 0000000..5d8e386
--- /dev/null
+++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment
@@ -0,0 +1,22 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 2"""
+ date="2024-01-18T17:50:21Z"
+ content="""
+I think that what git-annex could do is detect when progress updates are
+happening with too low a granularity for the annex.stalldetection
+configuration.
+
+When waiting for the first progress update, it can keep track of how much time
+has elapsed. If annex.stalldetection is "10mb/2m" and it took 20 minutes to
+get the first progress update, the granularity is clearly too low.
+
+And then it could disregard the configuration, or suggest a better
+configuration value, or adjust what it's expecting to match the
+observed granularity.
+
+(The stall detection auto-prober uses a similar heuristic to that already.
+It observes the granularity and only if it's sufficiently low (an update
+every 30 seconds or less) does it assume that 60 seconds without an update
+may be a stall.)
+"""]]
diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment

new file mode 100644 (file)

index 0000000..dd93371
--- /dev/null
+++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment
@@ -0,0 +1,20 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 3"""
+ date="2024-01-18T19:04:26Z"
+ content="""
+To reproduce this (more or less), I modified the example.sh external
+special remote to sleep for 10 seconds before each key store.
+Set up a remote with chunk=1mb, and annex.stalldetection = "0.001mb/1s".
+
+Uploading a 100 mb file, a stall is detected after the first chunk is
+uploaded. As expected, since 1 second passed with no update.
+
+When I resume the upload, the second chunk is uploaded and then a stall is
+detected on the third. And so on.
+
+I've implemented dynamic granularity scaling now, and in this test case, it notices
+it took 11 seconds for the first chunk, and behaves as if it were
+configured with annex.stalldetection of "0.022mb/22s". Which keeps it from
+detecting a stall.
+"""]]
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn

index b127c317e5f08da931235f4daf8b9a0f544672aa..6b0f3287107cdfb27094c2936073719a63ae8c86 100644 (file)
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -1538,8 +1538,8 @@ Remotes are configured using these settings in `.git/config`.
  
    Set to "false" to avoid all attempts at stall detection.
  
-  For more fine control over what constitutes a stall, set to a value in
-  the form "$amount/$timeperiod" to specify how much data git-annex should
+  To detect and cancel stalled transfers, set this to a value in the form
+  "$amount/$timeperiod" which specifies how much data git-annex should
    expect to see flowing, minimum, over a given period of time.
  
    For example, to detect outright stalls where no data has been transferred
@@ -1550,11 +1550,22 @@ Remotes are configured using these settings in `.git/config`.
    stuck for a long time, you could use:
    `git config remote.usbdrive.annex-stalldetection "1MB/1m"`
  
-  This is not enabled by default, because it can make git-annex use
-  more resources. To be able to cancel stalls, git-annex has to run
-  transfers in separate processes (one per concurrent job). So it
-  may need to open more connections to a remote than usual, or
-  the communication with those processes may make it a bit slower.
+  Some remotes don't report transfer progress, and stalls cannot be
+  detected when using those.
+
+  Some remotes only report transfer progress occasionally, eg
+  after each chunk. To avoid false timeouts in such a situation, if the
+  first progress update takes longer to arrive than the configured time
+  period, the stall detection will be automically adjusted to use a longer
+  time period. For example, if the first progress update comes after 10
+  minutes, but annex.stalldetection is "1MB/1m", it will be treated as eg
+  "15MB/15m".
+
+  Configuring stall detection can make git-annex use more resources. To be
+  able to cancel stalls, git-annex has to run transfers in separate
+  processes (one per concurrent job). So it may need to open more
+  connections to a remote than usual, or the communication with those
+  processes may make it a bit slower.
  
  * `remote.<name>.annex-checkuuid`
author	Joey Hess <joeyh@joeyh.name>
	Thu, 18 Jan 2024 21:11:56 +0000 (17:11 -0400)
committer	Joey Hess <joeyh@joeyh.name>
	Thu, 18 Jan 2024 21:12:10 +0000 (17:12 -0400)
Annex/StallDetection.hs		patch \| blob \| history
CHANGELOG		patch \| blob \| history
Remote/Helper/Chunked.hs		patch \| blob \| history
doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment	[new file with mode: 0644]	patch \| blob
doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment	[new file with mode: 0644]	patch \| blob
doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment	[new file with mode: 0644]	patch \| blob
doc/git-annex.mdwn		patch \| blob \| history