diff --git a/include/linux/swap.h b/include/linux/swap.h index 8a3c4a1caa14..24db9142e93b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -198,6 +198,16 @@ struct swap_cluster_info { #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +/* + * We assign a cluster to each CPU, so each CPU can allocate swap entry from + * its own cluster and swapout sequentially. The purpose is to optimize swapout + * throughput. + */ +struct percpu_cluster { + struct swap_cluster_info index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + /* * The in-memory structure used to track swap areas. */ @@ -217,6 +227,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 98e52e373bd8..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p, * It's possible scan_swap_map() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ -static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset) { + struct percpu_cluster *percpu_cluster; + bool conflict; + offset /= SWAPFILE_CLUSTER; - return !cluster_is_null(&si->free_cluster_head) && + conflict = !cluster_is_null(&si->free_cluster_head) && offset != cluster_next(&si->free_cluster_head) && cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. + */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; } static unsigned long scan_swap_map(struct swap_info_struct *si, @@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } -check_cluster: - if (!cluster_is_null(&si->free_cluster_head)) { - offset = cluster_next(&si->free_cluster_head) * - SWAPFILE_CLUSTER; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } else if (si->cluster_info) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them - */ - if (!cluster_is_null(&si->discard_cluster_head)) { - si->cluster_nr = 0; - swap_do_scheduled_discard(si); - scan_base = offset = si->cluster_next; - if (!si->cluster_nr) - goto check_cluster; - si->cluster_nr--; - goto checks; - } - - /* - * Checking free cluster is fast enough, we can do the - * check every time - */ - si->cluster_nr = 0; - goto checks; - } spin_unlock(&si->lock); @@ -516,8 +557,10 @@ check_cluster: } checks: - if (scan_swap_map_recheck_cluster(si, offset)) - goto check_cluster; + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); @@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } } error = swap_cgroup_swapon(p->type, maxpages); @@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);