diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa9ba2edf7e0..1cbad1248e5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - if (mem_cgroup_disabled()) - return 0; + if (mem_cgroup_disabled()) { + *min = 0; + *low = 0; + return; + } - return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - return 0; + *min = 0; + *low = 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index dfefa1d99d1b..70347d626fb3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,12 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long protection; + unsigned long min, low; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(memcg); + mem_cgroup_protection(memcg, &min, &low); - if (protection > 0) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2481,28 +2481,38 @@ out: * set it too low, which is not ideal. */ unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long baseline = 0; /* - * During the reclaim first pass, we only consider - * cgroups in excess of their protection setting, but if - * that doesn't produce free pages, we come back for a - * second pass where we reclaim from all groups. + * If there is any protection in place, we adjust scan + * pressure in proportion to how much a group's current + * usage exceeds that, in percent. * - * To maintain fairness in both cases, the first pass - * targets groups in proportion to their overage, and - * the second pass targets groups in proportion to their - * protection utilization. - * - * So on the first pass, a group whose size is 130% of - * its protection will be targeted at 30% of its size. - * On the second pass, a group whose size is at 40% of - * its protection will be - * targeted at 40% of its size. + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * of their best-effort protection they are using. Usage + * below memory.min is excluded from consideration when + * calculating utilisation, as it isn't ever + * reclaimable, so it might as well not exist for our + * purposes. */ - if (!sc->memcg_low_reclaim) - baseline = lruvec_size; - scan = lruvec_size * cgroup_size / protection - baseline; + if (sc->memcg_low_reclaim && low > min) { + /* + * Reclaim according to utilisation between min + * and low + */ + scan = lruvec_size * (cgroup_size - min) / + (low - min); + } else { + /* Reclaim according to protection overage */ + scan = lruvec_size * cgroup_size / + max(min, low) - lruvec_size; + } /* * Don't allow the scan target to exceed the lruvec @@ -2518,7 +2528,8 @@ out: * some cases in the case of large overages. * * Also, minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards. + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. */ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); } else {