diff --git a/include/linux/swap.h b/include/linux/swap.h index cb5baebf31d6..8a3c4a1caa14 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,8 +217,6 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int lowest_alloc; /* while preparing discard cluster */ - unsigned int highest_alloc; /* while preparing discard cluster */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ @@ -232,14 +230,18 @@ struct swap_info_struct { * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc and - * highest_alloc. other fields are only - * changed at swapon/swapoff, so are - * protected by swap_lock. changing - * flags need hold this lock and - * swap_lock. If both locks need hold, - * hold swap_lock first. + * cluster_nr, lowest_alloc, + * highest_alloc, free/discard cluster + * list. other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first. */ + struct work_struct discard_work; /* discard worker */ + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; struct swap_list_t { diff --git a/mm/swapfile.c b/mm/swapfile.c index d1fbeb486de5..dac47c66055c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) -{ - schedule(); - return 0; -} - #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 @@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. +*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -287,6 +365,16 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_count(&cluster_info[idx]) - 1); if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. + */ + if (p->flags & SWP_PAGE_DISCARD) { + swap_cluster_schedule_discard(p, idx); + return; + } + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); if (cluster_is_null(&p->free_cluster_head)) { cluster_set_next_flag(&p->free_cluster_head, idx, 0); @@ -319,7 +407,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -340,19 +427,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_PAGE_DISCARD) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide on - * (we scan without swap_lock to allow preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } check_cluster: if (!cluster_is_null(&si->free_cluster_head)) { offset = cluster_next(&si->free_cluster_head) * @@ -360,15 +434,27 @@ check_cluster: last_in_cluster = offset + SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } else if (si->cluster_info) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + if (!cluster_is_null(&si->discard_cluster_head)) { + si->cluster_nr = 0; + swap_do_scheduled_discard(si); + scan_base = offset = si->cluster_next; + if (!si->cluster_nr) + goto check_cluster; + si->cluster_nr--; + goto checks; + } + /* * Checking free cluster is fast enough, we can do the * check every time */ si->cluster_nr = 0; - si->lowest_alloc = 0; goto checks; } @@ -395,7 +481,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -416,7 +501,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -428,7 +512,6 @@ check_cluster: offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: @@ -470,59 +553,6 @@ checks: si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_PAGE_DISCARD, and there's a scan - * for a free cluster in progress or just completed. - */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). - */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&si->lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&si->lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. - */ - spin_unlock(&si->lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&si->lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. - */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: @@ -1806,6 +1836,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } + flush_work(&p->discard_work); + destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -2172,6 +2204,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, cluster_set_null(&p->free_cluster_head); cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2281,6 +2315,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(p)) return PTR_ERR(p); + INIT_WORK(&p->discard_work, swap_discard_work); + name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name);