Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

memcg: fix oom kill behavior

In the current page-fault code:

handle_mm_fault()
-> ...
-> mem_cgroup_charge()
-> map page or handle error.
-> check return code.

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory() is
called. But if the fault was caused by a memcg limit, the memcg OOM killer
has already been invoked, so a system-wide OOM kill is unnecessary.
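
For reference, the caller side looks roughly like the sketch below. This is a
simplified illustration of the per-arch fault handlers (e.g. arch/x86/mm/fault.c),
not code touched by this patch; details vary by architecture:

        fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
        if (unlikely(fault & VM_FAULT_OOM)) {
                /* the charge failed somewhere below handle_mm_fault() */
                up_read(&mm->mmap_sem);
                pagefault_out_of_memory();      /* global OOM path in mm/oom_kill.c */
                return;
        }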

Then I added patch a636b327f731143ccc544b966cfd8de6cb6d72c6. That patch
records last_oom_jiffies for the memcg's sub-hierarchy and prevents
pagefault_out_of_memory() from invoking the global OOM killer in the near
future.

But Nishimura-san reported that the jiffies-based check is not sufficient
when the system is under very heavy load.
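
For context, the check in question is the one this patch removes from
mem_cgroup_oom_called() (see the mm/memcontrol.c diff below). In condensed
form:

        /*
         * Condensed from the old code removed below: the global OOM killer was
         * suppressed only while the last memcg OOM kill was less than HZ/10
         * ticks ago. On a heavily loaded system the faulting task can easily
         * take longer than that to reach pagefault_out_of_memory(), so the
         * window expires and a needless system-wide OOM kill is triggered.
         */
        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
                ret = true;     /* memcg OOM already handled; skip global OOM */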

This patch changes memcg's OOM logic as follows (a condensed sketch of the
resulting charge path appears after the list):
* If a memcg causes an OOM kill, continue to retry the charge instead of
  returning VM_FAULT_OOM.
* Remove the jiffies check used until now.
* Add a memcg OOM lock which works like the per-zone OOM lock.
* If current has already been killed (as a process), bypass the charge.
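
The condensed charge path, simplified from the mm/memcontrol.c diff below
(this is an illustrative sketch of __mem_cgroup_try_charge() after the patch;
locking and reference-counting details are omitted):

        /* A task that is already dying is not throttled on the memcg limit. */
        if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)))
                goto bypass;

        /* ... charge/reclaim retry loop ... */
        if (!nr_retries--) {
                if (!oom)
                        goto nomem;
                /*
                 * mem_cgroup_handle_oom(): exactly one task per hierarchy takes
                 * the OOM lock and runs mem_cgroup_out_of_memory(); the others
                 * sleep TASK_KILLABLE on memcg_oom_waitq until it finishes.
                 */
                if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
                        nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
                        continue;       /* keep retrying; never return VM_FAULT_OOM */
                }
                /* 'false' means current itself was selected as the OOM victim */
                css_put(&mem->css);
                goto bypass;            /* charge is bypassed, not accounted */
        }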

Something more sophisticated can be added later, but this patch covers the
fundamental things.
TODO:
- add oom notifier
- add per-memcg disable-oom-kill flag and freezer at OOM.
- more chances to wake up OOM waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds
867578cb 0263c12c

+107 -41 total

include/linux/memcontrol.h  -6

···
         return false;
 }
 
-extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                 gfp_t gfp_mask, int nid,
···
 static inline bool mem_cgroup_disabled(void)
 {
         return true;
-}
-
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
-        return false;
 }
 
 static inline int

mm/memcontrol.c  +107 -27

···
          * Should the accounting and control be hierarchical, per subtree?
          */
         bool use_hierarchy;
-        unsigned long last_oom_jiffies;
+        atomic_t oom_lock;
         atomic_t refcnt;
 
         unsigned int swappiness;
···
         return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-        bool ret = false;
-        struct mem_cgroup *mem;
-        struct mm_struct *mm;
+        int *val = (int *)data;
+        int x;
+        /*
+         * Logically, we can stop scanning immediately when we find
+         * a memcg is already locked. But condidering unlock ops and
+         * creation/removal of memcg, scan-all is simple operation.
+         */
+        x = atomic_inc_return(&mem->oom_lock);
+        *val = max(x, *val);
+        return 0;
+}
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+        int lock_count = 0;
 
-        rcu_read_lock();
-        mm = task->mm;
-        if (!mm)
-                mm = &init_mm;
-        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-        if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-                ret = true;
-        rcu_read_unlock();
-        return ret;
+        mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+        if (lock_count == 1)
+                return true;
+        return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-        mem->last_oom_jiffies = jiffies;
+        /*
+         * When a new child is created while the hierarchy is under oom,
+         * mem_cgroup_oom_lock() may not be called. We have to use
+         * atomic_add_unless() here.
+         */
+        atomic_add_unless(&mem->oom_lock, -1, 0);
         return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
-        mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+        DEFINE_WAIT(wait);
+        bool locked;
+
+        /* At first, try to OOM lock hierarchy under mem.*/
+        mutex_lock(&memcg_oom_mutex);
+        locked = mem_cgroup_oom_lock(mem);
+        /*
+         * Even if signal_pending(), we can't quit charge() loop without
+         * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+         * under OOM is always welcomed, use TASK_KILLABLE here.
+         */
+        if (!locked)
+                prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+        mutex_unlock(&memcg_oom_mutex);
+
+        if (locked)
+                mem_cgroup_out_of_memory(mem, mask);
+        else {
+                schedule();
+                finish_wait(&memcg_oom_waitq, &wait);
+        }
+        mutex_lock(&memcg_oom_mutex);
+        mem_cgroup_oom_unlock(mem);
+        /*
+         * Here, we use global waitq .....more fine grained waitq ?
+         * Assume following hierarchy.
+         * A/
+         *   01
+         *   02
+         * assume OOM happens both in A and 01 at the same time. Tthey are
+         * mutually exclusive by lock. (kill in 01 helps A.)
+         * When we use per memcg waitq, we have to wake up waiters on A and 02
+         * in addtion to waiters on 01. We use global waitq for avoiding mess.
+         * It will not be a big problem.
+         * (And a task may be moved to other groups while it's waiting for OOM.)
+         */
+        wake_up_all(&memcg_oom_waitq);
+        mutex_unlock(&memcg_oom_mutex);
+
+        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+                return false;
+        /* Give chance to dying process */
+        schedule_timeout(1);
+        return true;
 }
 
 /*
···
         struct res_counter *fail_res;
         int csize = CHARGE_SIZE;
 
-        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-                /* Don't account this! */
-                *memcg = NULL;
-                return 0;
-        }
+        /*
+         * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+         * in system level. So, allow to go ahead dying process in addition to
+         * MEMDIE process.
+         */
+        if (unlikely(test_thread_flag(TIF_MEMDIE)
+                || fatal_signal_pending(current)))
+                goto bypass;
 
         /*
          * We always charge the cgroup the mm_struct belongs to.
···
                 }
 
                 if (!nr_retries--) {
-                        if (oom) {
-                                mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-                                record_last_oom(mem_over_limit);
+                        if (!oom)
+                                goto nomem;
+                        if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+                                nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+                                continue;
                         }
-                        goto nomem;
+                        /* When we reach here, current task is dying .*/
+                        css_put(&mem->css);
+                        goto bypass;
                 }
         }
         if (csize > PAGE_SIZE)
···
 nomem:
         css_put(&mem->css);
         return -ENOMEM;
+bypass:
+        *memcg = NULL;
+        return 0;
 }
 
 /*

mm/oom_kill.c  -8

···
                 /* Got some memory back in the last second. */
                 return;
 
-        /*
-         * If this is from memcg, oom-killer is already invoked.
-         * and not worth to go system-wide-oom.
-         */
-        if (mem_cgroup_oom_called(current))
-                goto rest_and_return;
-
         if (sysctl_panic_on_oom)
                 panic("out of memory from page fault. panic_on_oom is selected.\n");
 
···
          * Give "p" a good chance of killing itself before we
          * retry to allocate memory.
          */
-rest_and_return:
         if (!test_thread_flag(TIF_MEMDIE))
                 schedule_timeout_uninterruptible(1);
 }