Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"

author Yves-Alexis Perez <corsac@debian.org>

Fri, 2 Feb 2018 08:46:54 +0000 (09:46 +0100)

committer Yves-Alexis Perez <corsac@debian.org>

Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
author Yves-Alexis Perez <corsac@debian.org>
Fri, 2 Feb 2018 08:46:54 +0000 (09:46 +0100)
committer Yves-Alexis Perez <corsac@debian.org>
Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 65a686a7bf34db8661f9b014c590025011781292..1192eb029c5b5cebce0d6a1efb3aad3f3f5a9fea 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,8 +633,6 @@ typedef struct pglist_data {
         int kswapd_order;
         enum zone_type kswapd_classzone_idx;
  
-       int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
-
  #ifdef CONFIG_COMPACTION
         int kcompactd_max_order;
         enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h

index 3e2d016947479f5d403734031f2abf66dd985532..34a5459e5989e48fc523c37079a0919b62b35a60 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -73,12 +73,6 @@ static inline void set_page_refcounted(struct page *page)
  
  extern unsigned long highest_memmap_pfn;
  
-/*
- * Maximum number of reclaim retries without progress before the OOM
- * killer is consider the only way forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
  /*
   * in mm/vmscan.c:
   */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 94018ea5f93523b367f27286c6497907bbebb2c8..546713b3f762050f51e04290cc39ec1e939dfa56 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3421,6 +3421,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
         return false;
  }
  
+/*
+ * Maximum number of reclaim retries without any progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
  /*
   * Checks whether it makes sense to retry the reclaim to make a forward progress
   * for the given allocation request.
@@ -4379,8 +4385,7 @@ void show_free_areas(unsigned int filter)
                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                         K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                         node_page_state(pgdat, NR_PAGES_SCANNED),
-                       pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
-                               "yes" : "no");
+                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
         }
  
         for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c

index f118dc23f6624ac8b5ad9fb0f719805adf8662ed..30a88b945a44f58e812a5eaf824a087b51a5dbec 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2606,15 +2606,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
-       /*
-        * Kswapd gives up on balancing particular nodes after too
-        * many failures to reclaim anything from them and goes to
-        * sleep. On reclaim progress, reset the failure counter. A
-        * successful direct reclaim run will revive a dormant kswapd.
-        */
-       if (reclaimable)
-               pgdat->kswapd_failures = 0;
-
         return reclaimable;
  }
  
@@ -2689,6 +2680,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                  GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
  
+                       if (sc->priority != DEF_PRIORITY &&
+                           !pgdat_reclaimable(zone->zone_pgdat))
+                               continue;       /* Let kswapd poll it */
+
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
@@ -2825,7 +2820,7 @@ retry:
         return 0;
  }
  
-static bool allow_direct_reclaim(pg_data_t *pgdat)
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  {
         struct zone *zone;
         unsigned long pfmemalloc_reserve = 0;
@@ -2833,9 +2828,6 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
         int i;
         bool wmark_ok;
  
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return true;
-
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
                 if (!managed_zone(zone) ||
@@ -2916,7 +2908,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  
                 /* Throttle based on the first usable node */
                 pgdat = zone->zone_pgdat;
-               if (allow_direct_reclaim(pgdat))
+               if (pfmemalloc_watermark_ok(pgdat))
                         goto out;
                 break;
         }
@@ -2938,14 +2930,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
          */
         if (!(gfp_mask & __GFP_FS)) {
                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       allow_direct_reclaim(pgdat), HZ);
+                       pfmemalloc_watermark_ok(pgdat), HZ);
  
                 goto check_pending;
         }
  
         /* Throttle until kswapd wakes the process */
         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
+               pfmemalloc_watermark_ok(pgdat));
  
  check_pending:
         if (fatal_signal_pending(current))
@@ -3124,7 +3116,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as allow_direct_reclaim() is true. But there is a potential
+        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
          * race between when kswapd checks the watermarks and a process gets
          * throttled. There is also a potential race if processes get
          * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3138,10 +3130,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
  
-       /* Hopeless node, leave it to direct reclaim */
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return true;
-
         for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
  
@@ -3228,9 +3216,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         count_vm_event(PAGEOUTRUN);
  
         do {
-               unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
  
+               sc.nr_reclaimed = 0;
                 sc.reclaim_idx = classzone_idx;
  
                 /*
@@ -3309,7 +3297,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * able to safely make forward progress. Wake them
                  */
                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               allow_direct_reclaim(pgdat))
+                               pfmemalloc_watermark_ok(pgdat))
                         wake_up_all(&pgdat->pfmemalloc_wait);
  
                 /* Check if kswapd should be suspending */
@@ -3320,14 +3308,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Raise priority if scanning rate is too low or there was no
                  * progress in reclaiming pages
                  */
-               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
-               if (raise_priority || !nr_reclaimed)
+               if (raise_priority || !sc.nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
  
-       if (!sc.nr_reclaimed)
-               pgdat->kswapd_failures++;
-
  out:
         /*
          * Return the order kswapd stopped reclaiming at as
@@ -3527,10 +3511,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
  
-       /* Hopeless node, leave it to direct reclaim */
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return;
-
         /* Only wake kswapd if all zones are unbalanced */
         for (z = 0; z <= classzone_idx; z++) {
                 zone = pgdat->node_zones + z;
@@ -3801,6 +3781,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
             sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                 return NODE_RECLAIM_FULL;
  
+       if (!pgdat_reclaimable(pgdat))
+               return NODE_RECLAIM_FULL;
+
         /*
          * Do not scan if the allocation should not be delayed.
          */
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 3863b5d6d598f72df2a6a74af4a0218c7bd6dadd..6a088df04b29d6605de6eb18b028bc9fca5783ca 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1421,7 +1421,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                    "\n  node_unreclaimable:  %u"
                    "\n  start_pfn:           %lu"
                    "\n  node_inactive_ratio: %u",
-                  pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+                  !pgdat_reclaimable(zone->zone_pgdat),
                    zone->zone_start_pfn,
                    zone->zone_pgdat->inactive_ratio);
         seq_putc(m, '\n');
author	Yves-Alexis Perez <corsac@debian.org>
	Fri, 2 Feb 2018 08:46:54 +0000 (09:46 +0100)
committer	Yves-Alexis Perez <corsac@debian.org>
	Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
include/linux/mmzone.h		patch | blob | history
mm/internal.h		patch | blob | history
mm/page_alloc.c		patch | blob | history
mm/vmscan.c		patch | blob | history
mm/vmstat.c		patch | blob | history