From c71c512f4a65267e6a18163f4df729c489a51035 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 16 Dec 2020 19:33:29 +0100 Subject: [PATCH 01/62] net/sched: sch_taprio: reset child qdiscs before freeing them [ Upstream commit 44d4775ca51805b376a8db5b34f650434a08e556 ] syzkaller shows that packets can still be dequeued while taprio_destroy() is running. Let sch_taprio use the reset() function to cancel the advance timer and drop all skbs from the child qdiscs. Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Link: https://syzkaller.appspot.com/bug?id=f362872379bf8f0017fb667c1ab158f2d1e764ae Reported-by: syzbot+8971da381fb5a31f542d@syzkaller.appspotmail.com Signed-off-by: Davide Caratti Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/63b6d79b0e830ebb0283e020db4df3cdfdfb2b94.1608142843.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_taprio.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2b797a71e9bd..f2b1305e79d2 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1597,6 +1597,21 @@ free_sched: return err; } +static void taprio_reset(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int i; + + hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + qdisc_reset(q->qdiscs[i]); + } + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -1607,7 +1622,6 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); - hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); @@ -1954,6 +1968,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, + .reset = taprio_reset, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, From d3076d054f3e1dc7165449a1d901c98030bae508 Mon Sep 17 00:00:00 2001 From: Kevin Vigor Date: Fri, 6 Nov 2020 14:20:34 -0800 Subject: [PATCH 02/62] md/raid10: initialize r10_bio->read_slot before use. commit 93decc563637c4288380912eac0eb42fb246cc04 upstream. In __make_request() a new r10bio is allocated and passed to raid10_read_request(). The read_slot member of the bio is not initialized, and the raid10_read_request() uses it to index an array. This leads to occasional panics. Fix by initializing the field to invalid value and checking for valid value in raid10_read_request(). Cc: stable@vger.kernel.org Signed-off-by: Kevin Vigor Signed-off-by: Song Liu Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid10.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ec136e44aef7..a195a85cc366 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1145,7 +1145,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - if (r10_bio->devs[slot].rdev) { + if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, @@ -1510,6 +1510,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + r10_bio->read_slot = -1; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) From faa72d97c3e3cddb9382aab3a7042deb5601a9d0 Mon Sep 17 00:00:00 2001 From: Zhuguangqing Date: Fri, 6 Nov 2020 17:22:43 +0800 Subject: [PATCH 03/62] thermal/drivers/cpufreq_cooling: Update cpufreq_state only if state has changed commit 236761f19a4f373354f1dcf399b57753f1f4b871 upstream. If state has not changed successfully and we updated cpufreq_state, next time when the new state is equal to cpufreq_state (not changed successfully last time), we will return directly and miss a freq_qos_update_request() that should have been. Fixes: 5130802ddbb1 ("thermal: cpu_cooling: Switch to QoS requests for freq limits") Cc: v5.4+ # v5.4+ Signed-off-by: Zhuguangqing Acked-by: Viresh Kumar Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20201106092243.15574-1-zhuguangqing83@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/thermal/cpu_cooling.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index c37886a26712..9d24bc05df0d 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -320,6 +320,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; + int ret; /* Request state should be less than max_level */ if (WARN_ON(state > cpufreq_cdev->max_level)) @@ -329,10 +330,12 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, if (cpufreq_cdev->cpufreq_state == state) return 0; - cpufreq_cdev->cpufreq_state = state; + ret = freq_qos_update_request(&cpufreq_cdev->qos_req, + cpufreq_cdev->freq_table[state].frequency); + if (ret > 0) + cpufreq_cdev->cpufreq_state = state; - return freq_qos_update_request(&cpufreq_cdev->qos_req, - cpufreq_cdev->freq_table[state].frequency); + return ret; } /** From 3ebfed353afd3a7b2ccd4c858691fe35db50ba0f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 10:54:31 -0800 Subject: [PATCH 04/62] ext4: prevent creating duplicate encrypted filenames commit 75d18cd1868c2aee43553723872c35d7908f240f upstream. As described in "fscrypt: add fscrypt_is_nokey_name()", it's possible to create a duplicate filename in an encrypted directory by creating a file concurrently with adding the directory's encryption key. Fix this bug on ext4 by rejecting no-key dentries in ext4_add_entry(). Note that the duplicate check in ext4_find_dest_de() sometimes prevented this bug. However in many cases it didn't, since ext4_find_dest_de() doesn't examine every dentry. Fixes: 4461471107b7 ("ext4 crypto: enable filename encryption") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201118075609.120337-3-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 36a81b57012a..59038e361337 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2192,6 +2192,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + #ifdef CONFIG_UNICODE if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) From 6fe20a5204a6841410da31c3e48ab9f06c6c3437 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 10:54:33 -0800 Subject: [PATCH 05/62] ubifs: prevent creating duplicate encrypted filenames commit 76786a0f083473de31678bdb259a3d4167cf756d upstream. As described in "fscrypt: add fscrypt_is_nokey_name()", it's possible to create a duplicate filename in an encrypted directory by creating a file concurrently with adding the directory's encryption key. Fix this bug on ubifs by rejecting no-key dentries in ubifs_create(), ubifs_mkdir(), ubifs_mknod(), and ubifs_symlink(). Note that ubifs doesn't actually report the duplicate filenames from readdir, but rather it seems to replace the original dentry with a new one (which is still wrong, just a different effect from ext4). On ubifs, this fixes xfstest generic/595 as well as the new xfstest I wrote specifically for this bug. Fixes: f4f61d2cc6d8 ("ubifs: Implement encrypted filenames") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201118075609.120337-5-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/dir.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 6c0e19f7a21f..a5e5e9b9d4e3 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -278,6 +278,15 @@ done: return d_splice_alias(inode, dentry); } +static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, + struct fscrypt_name *nm) +{ + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + + return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); +} + static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -301,7 +310,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (err) return err; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + err = ubifs_prepare_create(dir, dentry, &nm); if (err) goto out_budg; @@ -961,7 +970,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + err = ubifs_prepare_create(dir, dentry, &nm); if (err) goto out_budg; @@ -1046,7 +1055,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, return err; } - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + err = ubifs_prepare_create(dir, dentry, &nm); if (err) { kfree(dev); goto out_budg; @@ -1130,7 +1139,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + err = ubifs_prepare_create(dir, dentry, &nm); if (err) goto out_budg; From eddc69467e39e7038dac877d566d60fc6ffb1fdd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 10:54:32 -0800 Subject: [PATCH 06/62] f2fs: prevent creating duplicate encrypted filenames commit bfc2b7e8518999003a61f91c1deb5e88ed77b07d upstream. As described in "fscrypt: add fscrypt_is_nokey_name()", it's possible to create a duplicate filename in an encrypted directory by creating a file concurrently with adding the directory's encryption key. Fix this bug on f2fs by rejecting no-key dentries in f2fs_add_link(). Note that the weird check for the current task in f2fs_do_add_link() seems to make this bug difficult to reproduce on f2fs. Fixes: 9ea97163c6da ("f2fs crypto: add filename encryption for f2fs_add_link") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201118075609.120337-4-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/f2fs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 63440abe58c4..0ddc4a74b9d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2998,6 +2998,8 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } From 34f000524d3327283d3cdf6437f289bc1f172a87 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 10:54:30 -0800 Subject: [PATCH 07/62] fscrypt: add fscrypt_is_nokey_name() commit 159e1de201b6fca10bfec50405a3b53a561096a8 upstream. It's possible to create a duplicate filename in an encrypted directory by creating a file concurrently with adding the encryption key. Specifically, sys_open(O_CREAT) (or sys_mkdir(), sys_mknod(), or sys_symlink()) can lookup the target filename while the directory's encryption key hasn't been added yet, resulting in a negative no-key dentry. The VFS then calls ->create() (or ->mkdir(), ->mknod(), or ->symlink()) because the dentry is negative. Normally, ->create() would return -ENOKEY due to the directory's key being unavailable. However, if the key was added between the dentry lookup and ->create(), then the filesystem will go ahead and try to create the file. If the target filename happens to already exist as a normal name (not a no-key name), a duplicate filename may be added to the directory. In order to fix this, we need to fix the filesystems to prevent ->create(), ->mkdir(), ->mknod(), and ->symlink() on no-key names. (->rename() and ->link() need it too, but those are already handled correctly by fscrypt_prepare_rename() and fscrypt_prepare_link().) In preparation for this, add a helper function fscrypt_is_nokey_name() that filesystems can use to do this check. Use this helper function for the existing checks that fs/crypto/ does for rename and link. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201118075609.120337-2-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/crypto/hooks.c | 10 +++++----- include/linux/fscrypt.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bb3b7fcfdd48..a5a40a76b8ed 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -58,8 +58,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, if (err) return err; - /* ... in case we looked up ciphertext name before key was added */ - if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name before key was added */ + if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -83,9 +83,9 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) return err; - /* ... in case we looked up ciphertext name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & - DCACHE_ENCRYPTED_NAME) + /* ... in case we looked up no-key name(s) before key was added */ + if (fscrypt_is_nokey_name(old_dentry) || + fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; if (old_dir != new_dir) { diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f622f7460ed8..032e5bcf9701 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -100,6 +100,35 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME; } +/** + * fscrypt_is_nokey_name() - test whether a dentry is a no-key name + * @dentry: the dentry to check + * + * This returns true if the dentry is a no-key dentry. A no-key dentry is a + * dentry that was created in an encrypted directory that hasn't had its + * encryption key added yet. Such dentries may be either positive or negative. + * + * When a filesystem is asked to create a new filename in an encrypted directory + * and the new filename's dentry is a no-key dentry, it must fail the operation + * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), + * ->rename(), and ->link(). (However, ->rename() and ->link() are already + * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) + * + * This is necessary because creating a filename requires the directory's + * encryption key, but just checking for the key on the directory inode during + * the final filesystem operation doesn't guarantee that the key was available + * during the preceding dentry lookup. And the key must have already been + * available during the dentry lookup in order for it to have been checked + * whether the filename already exists in the directory and for the new file's + * dentry not to be invalidated due to it incorrectly having the no-key flag. + * + * Return: %true if the dentry is a no-key name + */ +static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_ENCRYPTED_NAME; +} + /* crypto.c */ extern void fscrypt_enqueue_decrypt_work(struct work_struct *); extern struct fscrypt_ctx *fscrypt_get_ctx(gfp_t); @@ -290,6 +319,11 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) { } +static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) +{ + return false; +} + /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { From 29c2d3e91e3d060dc7941660b645f044aff3ad37 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 10:47:47 -0800 Subject: [PATCH 08/62] fscrypt: remove kernel-internal constants from UAPI header commit 3ceb6543e9cf6ed87cc1fbc6f23ca2db903564cd upstream. There isn't really any valid reason to use __FSCRYPT_MODE_MAX or FSCRYPT_POLICY_FLAGS_VALID in a userspace program. These constants are only meant to be used by the kernel internally, and they are defined in the UAPI header next to the mode numbers and flags only so that kernel developers don't forget to update them when adding new modes or flags. In https://lkml.kernel.org/r/20201005074133.1958633-2-satyat@google.com there was an example of someone wanting to use __FSCRYPT_MODE_MAX in a user program, and it was wrong because the program would have broken if __FSCRYPT_MODE_MAX were ever increased. So having this definition available is harmful. FSCRYPT_POLICY_FLAGS_VALID has the same problem. So, remove these definitions from the UAPI header. Replace FSCRYPT_POLICY_FLAGS_VALID with just listing the valid flags explicitly in the one kernel function that needs it. Move __FSCRYPT_MODE_MAX to fscrypt_private.h, remove the double underscores (which were only present to discourage use by userspace), and add a BUILD_BUG_ON() and comments to (hopefully) ensure it is kept in sync. Keep the old name FS_POLICY_FLAGS_VALID, since it's been around for longer and there's a greater chance that removing it would break source compatibility with some program. Indeed, mtd-utils is using it in an #ifdef, and removing it would introduce compiler warnings (about FS_POLICY_FLAGS_PAD_* being redefined) into the mtd-utils build. However, reduce its value to 0x07 so that it only includes the flags with old names (the ones present before Linux 5.4), and try to make it clear that it's now "frozen" and no new flags should be added to it. Fixes: 2336d0deb2d4 ("fscrypt: use FSCRYPT_ prefix for uapi constants") Cc: # v5.4+ Link: https://lore.kernel.org/r/20201024005132.495952-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/crypto/fscrypt_private.h | 5 ++++- fs/crypto/keysetup.c | 2 ++ fs/crypto/policy.c | 6 ++++-- include/uapi/linux/fscrypt.h | 5 ++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e84efc01512e..ec7387266190 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -23,6 +23,9 @@ #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 +/* Keep this in sync with include/uapi/linux/fscrypt.h */ +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM + struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; @@ -387,7 +390,7 @@ struct fscrypt_master_key { spinlock_t mk_decrypted_inodes_lock; /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_mode_keys[FSCRYPT_MODE_MAX + 1]; } __randomize_layout; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 75898340eb46..3e86f75b532a 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -55,6 +55,8 @@ static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { + BUILD_BUG_ON(ARRAY_SIZE(available_modes) != FSCRYPT_MODE_MAX + 1); + if (S_ISREG(inode->i_mode)) return &available_modes[fscrypt_policy_contents_mode(policy)]; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4072ba644595..8e1b10861c10 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -55,7 +55,8 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, return false; } - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | + FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); @@ -76,7 +77,8 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, return false; } - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | + FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 39ccfe9311c3..b14f436f4ebd 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -17,7 +17,6 @@ #define FSCRYPT_POLICY_FLAGS_PAD_32 0x03 #define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 #define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 -#define FSCRYPT_POLICY_FLAGS_VALID 0x07 /* Encryption algorithms */ #define FSCRYPT_MODE_AES_256_XTS 1 @@ -25,7 +24,7 @@ #define FSCRYPT_MODE_AES_128_CBC 5 #define FSCRYPT_MODE_AES_128_CTS 6 #define FSCRYPT_MODE_ADIANTUM 9 -#define __FSCRYPT_MODE_MAX 9 +/* If adding a mode number > 9, update FSCRYPT_MODE_MAX in fscrypt_private.h */ /* * Legacy policy version; ad-hoc KDF and no key verification. @@ -162,7 +161,7 @@ struct fscrypt_get_key_status_arg { #define FS_POLICY_FLAGS_PAD_32 FSCRYPT_POLICY_FLAGS_PAD_32 #define FS_POLICY_FLAGS_PAD_MASK FSCRYPT_POLICY_FLAGS_PAD_MASK #define FS_POLICY_FLAG_DIRECT_KEY FSCRYPT_POLICY_FLAG_DIRECT_KEY -#define FS_POLICY_FLAGS_VALID FSCRYPT_POLICY_FLAGS_VALID +#define FS_POLICY_FLAGS_VALID 0x07 /* contains old flags only */ #define FS_ENCRYPTION_MODE_INVALID 0 /* never used */ #define FS_ENCRYPTION_MODE_AES_256_XTS FSCRYPT_MODE_AES_256_XTS #define FS_ENCRYPTION_MODE_AES_256_GCM 2 /* never used */ From 30aea96ff142fe30545fd2ec071fddddca89ca42 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 13 Nov 2020 18:52:02 +0100 Subject: [PATCH 09/62] vfio/pci: Move dummy_resources_list init in vfio_pci_probe() [ Upstream commit 16b8fe4caf499ae8e12d2ab1b1324497e36a7b83 ] In case an error occurs in vfio_pci_enable() before the call to vfio_pci_probe_mmaps(), vfio_pci_disable() will try to iterate on an uninitialized list and cause a kernel panic. Lets move to the initialization to vfio_pci_probe() to fix the issue. Signed-off-by: Eric Auger Fixes: 05f0c03fbac1 ("vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive") CC: Stable # v4.7+ Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin --- drivers/vfio/pci/vfio_pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 632653cd70e3..2372e161cd5e 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -114,8 +114,6 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) int bar; struct vfio_pci_dummy_resource *dummy_res; - INIT_LIST_HEAD(&vdev->dummy_resources_list); - for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { res = vdev->pdev->resource + bar; @@ -1606,6 +1604,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); mutex_init(&vdev->vma_lock); INIT_LIST_HEAD(&vdev->vma_list); From 6b0a4f603d5b209af45ee973b0f8a2e1838e7b4a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:33 +0000 Subject: [PATCH 10/62] btrfs: fix race when defragmenting leads to unnecessary IO [ Upstream commit 7f458a3873ae94efe1f37c8b96c97e7298769e98 ] When defragmenting we skip ranges that have holes or inline extents, so that we don't do unnecessary IO and waste space. We do this check when calling should_defrag_range() at btrfs_defrag_file(). However we do it without holding the inode's lock. The reason we do it like this is to avoid blocking other tasks for too long, that possibly want to operate on other file ranges, since after the call to should_defrag_range() and before locking the inode, we trigger a synchronous page cache readahead. However before we were able to lock the inode, some other task might have punched a hole in our range, or we may now have an inline extent there, in which case we should not set the range for defrag anymore since that would cause unnecessary IO and make us waste space (i.e. allocating extents to contain zeros for a hole). So after we locked the inode and the range in the iotree, check again if we have holes or an inline extent, and if we do, just skip the range. I hit this while testing my next patch that fixes races when updating an inode's number of bytes (subject "btrfs: update the number of bytes used by an inode atomically"), and it depends on this change in order to work correctly. Alternatively I could rework that other patch to detect holes and flag their range with the 'new delalloc' bit, but this itself fixes an efficiency problem due a race that from a functional point of view is not harmful (it could be triggered with btrfs/062 from fstests). CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/ioctl.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f58e03d1775d..8ed71b3b2546 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1256,6 +1256,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_end; u64 page_cnt; u64 start = (u64)start_index << PAGE_SHIFT; + u64 search_start; int ret; int i; int i_done; @@ -1352,6 +1353,40 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); + + /* + * When defragmenting we skip ranges that have holes or inline extents, + * (check should_defrag_range()), to avoid unnecessary IO and wasting + * space. At btrfs_defrag_file(), we check if a range should be defragged + * before locking the inode and then, if it should, we trigger a sync + * page cache readahead - we lock the inode only after that to avoid + * blocking for too long other tasks that possibly want to operate on + * other file ranges. But before we were able to get the inode lock, + * some other task may have punched a hole in the range, or we may have + * now an inline extent, in which case we should not defrag. So check + * for that here, where we have the inode and the range locked, and bail + * out if that happened. + */ + search_start = page_start; + while (search_start < page_end) { + struct extent_map *em; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, + page_end - search_start, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_range; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + /* Ok, 0 means we did not defrag anything */ + ret = 0; + goto out_unlock_range; + } + search_start = extent_map_end(em); + free_extent_map(em); + } + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); @@ -1382,6 +1417,10 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; + +out_unlock_range: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state); out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); From 11459136a107bd45fdf786a922ee03b7dbb65c87 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:54 +0100 Subject: [PATCH 11/62] ext4: don't remount read-only with errors=continue on reboot [ Upstream commit b08070eca9e247f60ab39d79b2c25d274750441f ] ext4_handle_error() with errors=continue mount option can accidentally remount the filesystem read-only when the system is rebooting. Fix that. Fixes: 1dc1097ff60e ("ext4: avoid panic during forced reboot") Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Cc: stable@kernel.org Link: https://lore.kernel.org/r/20201127113405.26867-2-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/super.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 920658ca8777..06568467b0c2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -455,19 +455,17 @@ static bool system_going_down(void) static void ext4_handle_error(struct super_block *sb) { + journal_t *journal = EXT4_SB(sb)->s_journal; + if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (sb_rdonly(sb)) + if (sb_rdonly(sb) || test_opt(sb, ERRORS_CONT)) return; - if (!test_opt(sb, ERRORS_CONT)) { - journal_t *journal = EXT4_SB(sb)->s_journal; - - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; - if (journal) - jbd2_journal_abort(journal, -EIO); - } + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (journal) + jbd2_journal_abort(journal, -EIO); /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already From d77c1ab54c9edf38a1723902c9eaba133b5ca6c8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 16:33:06 +0100 Subject: [PATCH 12/62] KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL [ Upstream commit 6441fa6178f5456d1d4b512c08798888f99db185 ] If the guest is configured to have SPEC_CTRL but the host does not (which is a nonsensical configuration but these are not explicitly forbidden) then a host-initiated MSR write can write vmx->spec_ctrl (respectively svm->spec_ctrl) and trigger a #GP when KVM tries to restore the host value of the MSR. Add a more comprehensive check for valid bits of SPEC_CTRL, covering host CPUID flags and, since we are at it and it is more correct that way, guest CPUID flags too. For AMD, remove the unnecessary is_guest_mode check around setting the MSR interception bitmap, so that the code looks the same as for Intel. Cc: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/svm.c | 9 +++------ arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c79c1a07f44b..72bf1d8175ac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4322,12 +4322,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; svm->spec_ctrl = data; - if (!data) break; @@ -4351,13 +4349,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - if (is_guest_mode(vcpu)) - break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2a1ed3aae100..8450fce70bd9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1974,12 +1974,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; vmx->spec_ctrl = data; - if (!data) break; @@ -2006,7 +2004,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + return 1; if (!data) break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b7f86acb8c91..72990c3c6faf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10369,6 +10369,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) +{ + uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has(X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + + return bits; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index de6b55484876..301286d92432 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -368,5 +368,6 @@ static inline bool kvm_pat_valid(u64 data) void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); #endif From 3d4a05894500693ff25ba99fc38994044a92236b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Feb 2020 16:10:52 +0100 Subject: [PATCH 13/62] KVM: SVM: relax conditions for allowing MSR_IA32_SPEC_CTRL accesses [ Upstream commit df7e8818926eb4712b67421442acf7d568fe2645 ] Userspace that does not know about the AMD_IBRS bit might still allow the guest to protect itself with MSR_IA32_SPEC_CTRL using the Intel SPEC_CTRL bit. However, svm.c disallows this and will cause a #GP in the guest when writing to the MSR. Fix this by loosening the test and allowing the Intel CPUID bit, and in fact allow the AMD_STIBP bit as well since it allows writing to MSR_IA32_SPEC_CTRL too. Reported-by: Zhiyi Guo Analyzed-by: Dr. David Alan Gilbert Analyzed-by: Laszlo Ersek Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 72bf1d8175ac..ca746006ac04 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4233,6 +4233,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; @@ -4318,6 +4320,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; From 7cb6087b45367aa2edf7f026a3144031ce4dc75a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 3 Dec 2020 09:40:15 -0500 Subject: [PATCH 14/62] KVM: x86: reinstate vendor-agnostic check on SPEC_CTRL cpuid bits [ Upstream commit 39485ed95d6b83b62fa75c06c2c4d33992e0d971 ] Until commit e7c587da1252 ("x86/speculation: Use synthetic bits for IBRS/IBPB/STIBP"), KVM was testing both Intel and AMD CPUID bits before allowing the guest to write MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD. Testing only Intel bits on VMX processors, or only AMD bits on SVM processors, fails if the guests are created with the "opposite" vendor as the host. While at it, also tweak the host CPU check to use the vendor-agnostic feature bit X86_FEATURE_IBPB, since we only care about the availability of the MSR on the host here and not about specific CPUID bits. Fixes: e7c587da1252 ("x86/speculation: Use synthetic bits for IBRS/IBPB/STIBP") Cc: stable@vger.kernel.org Reported-by: Denis V. Lunev Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/cpuid.h | 14 ++++++++++++++ arch/x86/kvm/svm.c | 14 ++++---------- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d78a61408243..7dec43b2c420 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -154,6 +154,20 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)); +} + static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca746006ac04..2b506904be02 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4233,10 +4233,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = svm->spec_ctrl; @@ -4320,10 +4317,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) @@ -4348,12 +4342,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8450fce70bd9..e7fd2f00edc1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1788,7 +1788,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; @@ -1971,7 +1971,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_spec_ctrl_msr(vcpu)) return 1; if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) @@ -1999,12 +1999,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; From ee78e7d93e351b7baee5dc5ac1492ddc98accf43 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Oct 2020 14:05:46 +0000 Subject: [PATCH 15/62] powerpc/bitops: Fix possible undefined behaviour with fls() and fls64() [ Upstream commit 1891ef21d92c4801ea082ee8ed478e304ddc6749 ] fls() and fls64() are using __builtin_ctz() and _builtin_ctzll(). On powerpc, those builtins trivially use ctlzw and ctlzd power instructions. Allthough those instructions provide the expected result with input argument 0, __builtin_ctz() and __builtin_ctzll() are documented as undefined for value 0. The easiest fix would be to use fls() and fls64() functions defined in include/asm-generic/bitops/builtin-fls.h and include/asm-generic/bitops/fls64.h, but GCC output is not optimal: 00000388 : 388: 2c 03 00 00 cmpwi r3,0 38c: 41 82 00 10 beq 39c 390: 7c 63 00 34 cntlzw r3,r3 394: 20 63 00 20 subfic r3,r3,32 398: 4e 80 00 20 blr 39c: 38 60 00 00 li r3,0 3a0: 4e 80 00 20 blr 000003b0 : 3b0: 2c 03 00 00 cmpwi r3,0 3b4: 40 82 00 1c bne 3d0 3b8: 2f 84 00 00 cmpwi cr7,r4,0 3bc: 38 60 00 00 li r3,0 3c0: 4d 9e 00 20 beqlr cr7 3c4: 7c 83 00 34 cntlzw r3,r4 3c8: 20 63 00 20 subfic r3,r3,32 3cc: 4e 80 00 20 blr 3d0: 7c 63 00 34 cntlzw r3,r3 3d4: 20 63 00 40 subfic r3,r3,64 3d8: 4e 80 00 20 blr When the input of fls(x) is a constant, just check x for nullity and return either 0 or __builtin_clz(x). Otherwise, use cntlzw instruction directly. For fls64() on PPC64, do the same but with __builtin_clzll() and cntlzd instruction. On PPC32, lets take the generic fls64() which will use our fls(). The result is as expected: 00000388 : 388: 7c 63 00 34 cntlzw r3,r3 38c: 20 63 00 20 subfic r3,r3,32 390: 4e 80 00 20 blr 000003a0 : 3a0: 2c 03 00 00 cmpwi r3,0 3a4: 40 82 00 10 bne 3b4 3a8: 7c 83 00 34 cntlzw r3,r4 3ac: 20 63 00 20 subfic r3,r3,32 3b0: 4e 80 00 20 blr 3b4: 7c 63 00 34 cntlzw r3,r3 3b8: 20 63 00 40 subfic r3,r3,64 3bc: 4e 80 00 20 blr Fixes: 2fcff790dcb4 ("powerpc: Use builtin functions for fls()/__fls()/fls64()") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Acked-by: Segher Boessenkool Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/348c2d3f19ffcff8abe50d52513f989c4581d000.1603375524.git.christophe.leroy@csgroup.eu Signed-off-by: Sasha Levin --- arch/powerpc/include/asm/bitops.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 603aed229af7..46338f236004 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -217,15 +217,34 @@ static __inline__ void __clear_bit_unlock(int nr, volatile unsigned long *addr) */ static __inline__ int fls(unsigned int x) { - return 32 - __builtin_clz(x); + int lz; + + if (__builtin_constant_p(x)) + return x ? 32 - __builtin_clz(x) : 0; + asm("cntlzw %0,%1" : "=r" (lz) : "r" (x)); + return 32 - lz; } #include +/* + * 64-bit can do this using one cntlzd (count leading zeroes doubleword) + * instruction; for 32-bit we use the generic version, which does two + * 32-bit fls calls. + */ +#ifdef CONFIG_PPC64 static __inline__ int fls64(__u64 x) { - return 64 - __builtin_clzll(x); + int lz; + + if (__builtin_constant_p(x)) + return x ? 64 - __builtin_clzll(x) : 0; + asm("cntlzd %0,%1" : "=r" (lz) : "r" (x)); + return 64 - lz; } +#else +#include +#endif #ifdef CONFIG_PPC64 unsigned int __arch_hweight8(unsigned int w); From 3a83e289e4b7890a046d3ee2e7a59f3ec19f5dc1 Mon Sep 17 00:00:00 2001 From: lizhe Date: Wed, 14 Oct 2020 14:54:43 +0800 Subject: [PATCH 16/62] jffs2: Allow setting rp_size to zero during remounting [ Upstream commit cd3ed3c73ac671ff6b0230ccb72b8300292d3643 ] Set rp_size to zero will be ignore during remounting. The method to identify whether we input a remounting option of rp_size is to check if the rp_size input is zero. It can not work well if we pass "rp_size=0". This patch add a bool variable "set_rp_size" to fix this problem. Reported-by: Jubin Zhong Signed-off-by: lizhe Signed-off-by: Richard Weinberger Signed-off-by: Sasha Levin --- fs/jffs2/jffs2_fs_sb.h | 1 + fs/jffs2/super.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 778275f48a87..5a7091746f68 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -38,6 +38,7 @@ struct jffs2_mount_opts { * users. This is implemented simply by means of not allowing the * latter users to write to the file system if the amount if the * available space is less then 'rp_size'. */ + bool set_rp_size; unsigned int rp_size; }; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 60636b2e35ea..68ce77cbeed3 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -88,7 +88,7 @@ static int jffs2_show_options(struct seq_file *s, struct dentry *root) if (opts->override_compr) seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); - if (opts->rp_size) + if (opts->set_rp_size) seq_printf(s, ",rp_size=%u", opts->rp_size / 1024); return 0; @@ -212,6 +212,7 @@ static int jffs2_parse_param(struct fs_context *fc, struct fs_parameter *param) if (opt > c->mtd->size) return invalf(fc, "jffs2: Too large reserve pool specified, max is %llu KB", c->mtd->size / 1024); + c->mount_opts.set_rp_size = true; c->mount_opts.rp_size = opt; break; default: @@ -231,8 +232,10 @@ static inline void jffs2_update_mount_opts(struct fs_context *fc) c->mount_opts.override_compr = new_c->mount_opts.override_compr; c->mount_opts.compr = new_c->mount_opts.compr; } - if (new_c->mount_opts.rp_size) + if (new_c->mount_opts.set_rp_size) { + c->mount_opts.set_rp_size = new_c->mount_opts.set_rp_size; c->mount_opts.rp_size = new_c->mount_opts.rp_size; + } mutex_unlock(&c->alloc_sem); } From 9ce7ac5ed53b0ead01f60bb952f8f58f727f2059 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 12 Oct 2020 14:12:04 +0100 Subject: [PATCH 17/62] jffs2: Fix NULL pointer dereference in rp_size fs option parsing [ Upstream commit a61df3c413e49b0042f9caf774c58512d1cc71b7 ] syzkaller found the following JFFS2 splat: Unable to handle kernel paging request at virtual address dfffa00000000001 Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 [dfffa00000000001] address between user and kernel address ranges Internal error: Oops: 96000004 [#1] SMP Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 12745 Comm: syz-executor.5 Tainted: G S 5.9.0-rc8+ #98 Hardware name: linux,dummy-virt (DT) pstate: 20400005 (nzCv daif +PAN -UAO BTYPE=--) pc : jffs2_parse_param+0x138/0x308 fs/jffs2/super.c:206 lr : jffs2_parse_param+0x108/0x308 fs/jffs2/super.c:205 sp : ffff000022a57910 x29: ffff000022a57910 x28: 0000000000000000 x27: ffff000057634008 x26: 000000000000d800 x25: 000000000000d800 x24: ffff0000271a9000 x23: ffffa0001adb5dc0 x22: ffff000023fdcf00 x21: 1fffe0000454af2c x20: ffff000024cc9400 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: ffffa000102dbdd0 x15: 0000000000000000 x14: ffffa000109e44bc x13: ffffa00010a3a26c x12: ffff80000476e0b3 x11: 1fffe0000476e0b2 x10: ffff80000476e0b2 x9 : ffffa00010a3ad60 x8 : ffff000023b70593 x7 : 0000000000000003 x6 : 00000000f1f1f1f1 x5 : ffff000023fdcf00 x4 : 0000000000000002 x3 : ffffa00010000000 x2 : 0000000000000001 x1 : dfffa00000000000 x0 : 0000000000000008 Call trace: jffs2_parse_param+0x138/0x308 fs/jffs2/super.c:206 vfs_parse_fs_param+0x234/0x4e8 fs/fs_context.c:117 vfs_parse_fs_string+0xe8/0x148 fs/fs_context.c:161 generic_parse_monolithic+0x17c/0x208 fs/fs_context.c:201 parse_monolithic_mount_data+0x7c/0xa8 fs/fs_context.c:649 do_new_mount fs/namespace.c:2871 [inline] path_mount+0x548/0x1da8 fs/namespace.c:3192 do_mount+0x124/0x138 fs/namespace.c:3205 __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount fs/namespace.c:3390 [inline] __arm64_sys_mount+0x164/0x238 fs/namespace.c:3390 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall arch/arm64/kernel/syscall.c:48 [inline] el0_svc_common.constprop.0+0x15c/0x598 arch/arm64/kernel/syscall.c:149 do_el0_svc+0x60/0x150 arch/arm64/kernel/syscall.c:195 el0_svc+0x34/0xb0 arch/arm64/kernel/entry-common.c:226 el0_sync_handler+0xc8/0x5b4 arch/arm64/kernel/entry-common.c:236 el0_sync+0x15c/0x180 arch/arm64/kernel/entry.S:663 Code: d2d40001 f2fbffe1 91002260 d343fc02 (38e16841) ---[ end trace 4edf690313deda44 ]--- This is because since ec10a24f10c8, the option parsing happens before fill_super and so the MTD device isn't associated with the filesystem. Defer the size check until there is a valid association. Fixes: ec10a24f10c8 ("vfs: Convert jffs2 to use the new mount API") Cc: Cc: David Howells Signed-off-by: Jamie Iles Signed-off-by: Richard Weinberger Signed-off-by: Sasha Levin --- fs/jffs2/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 68ce77cbeed3..6839a61e8ff1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -208,12 +208,8 @@ static int jffs2_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_rp_size: if (result.uint_32 > UINT_MAX / 1024) return invalf(fc, "jffs2: rp_size unrepresentable"); - opt = result.uint_32 * 1024; - if (opt > c->mtd->size) - return invalf(fc, "jffs2: Too large reserve pool specified, max is %llu KB", - c->mtd->size / 1024); + c->mount_opts.rp_size = result.uint_32 * 1024; c->mount_opts.set_rp_size = true; - c->mount_opts.rp_size = opt; break; default: return -EINVAL; @@ -275,6 +271,10 @@ static int jffs2_fill_super(struct super_block *sb, struct fs_context *fc) c->mtd = sb->s_mtd; c->os_priv = sb; + if (c->mount_opts.rp_size > c->mtd->size) + return invalf(fc, "jffs2: Too large reserve pool specified, max is %llu KB", + c->mtd->size / 1024); + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); From af07e4dd07839fd64841c31edf5682e09008e98c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:44 -0800 Subject: [PATCH 18/62] scsi: block: Fix a race in the runtime power management code commit fa4d0f1992a96f6d7c988ef423e3127e613f6ac9 upstream. With the current implementation the following race can happen: * blk_pre_runtime_suspend() calls blk_freeze_queue_start() and blk_mq_unfreeze_queue(). * blk_queue_enter() calls blk_queue_pm_only() and that function returns true. * blk_queue_enter() calls blk_pm_request_resume() and that function does not call pm_request_resume() because the queue runtime status is RPM_ACTIVE. * blk_pre_runtime_suspend() changes the queue status into RPM_SUSPENDING. Fix this race by changing the queue runtime status into RPM_SUSPENDING before switching q_usage_counter to atomic mode. Link: https://lore.kernel.org/r/20201209052951.16136-2-bvanassche@acm.org Fixes: 986d413b7c15 ("blk-mq: Enable support for runtime power management") Cc: Ming Lei Cc: Rafael J. Wysocki Cc: stable Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Acked-by: Alan Stern Acked-by: Stanley Chu Co-developed-by: Can Guo Signed-off-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- block/blk-pm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-pm.c b/block/blk-pm.c index 1adc1cd748b4..2ccf88dbaa40 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -67,6 +67,10 @@ int blk_pre_runtime_suspend(struct request_queue *q) WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE); + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_SUSPENDING; + spin_unlock_irq(&q->queue_lock); + /* * Increase the pm_only counter before checking whether any * non-PM blk_queue_enter() calls are in progress to avoid that any @@ -89,15 +93,14 @@ int blk_pre_runtime_suspend(struct request_queue *q) /* Switch q_usage_counter back to per-cpu mode. */ blk_mq_unfreeze_queue(q); - spin_lock_irq(&q->queue_lock); - if (ret < 0) + if (ret < 0) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); - else - q->rpm_status = RPM_SUSPENDING; - spin_unlock_irq(&q->queue_lock); + spin_unlock_irq(&q->queue_lock); - if (ret) blk_clear_pm_only(q); + } return ret; } From 376c3111413cbc97315f0fb327ae45d1ebf8c8fc Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Mon, 14 Dec 2020 19:03:21 -0800 Subject: [PATCH 19/62] uapi: move constants from to commit a85cbe6159ffc973e5702f70a3bd5185f8f3c38d upstream. and include in UAPI headers instead of . The reason is to avoid indirect include when using some network headers: or others -> -> . This indirect include causes on MUSL redefinition of struct sysinfo when included both and some of UAPI headers: In file included from x86_64-buildroot-linux-musl/sysroot/usr/include/linux/kernel.h:5, from x86_64-buildroot-linux-musl/sysroot/usr/include/linux/netlink.h:5, from ../include/tst_netlink.h:14, from tst_crypto.c:13: x86_64-buildroot-linux-musl/sysroot/usr/include/linux/sysinfo.h:8:8: error: redefinition of `struct sysinfo' struct sysinfo { ^~~~~~~ In file included from ../include/tst_safe_macros.h:15, from ../include/tst_test.h:93, from tst_crypto.c:11: x86_64-buildroot-linux-musl/sysroot/usr/include/sys/sysinfo.h:10:8: note: originally defined here Link: https://lkml.kernel.org/r/20201015190013.8901-1-petr.vorel@gmail.com Signed-off-by: Petr Vorel Suggested-by: Rich Felker Acked-by: Rich Felker Cc: Peter Korsgaard Cc: Baruch Siach Cc: Florian Weimer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/const.h | 5 +++++ include/uapi/linux/ethtool.h | 2 +- include/uapi/linux/kernel.h | 9 +-------- include/uapi/linux/lightnvm.h | 2 +- include/uapi/linux/mroute6.h | 2 +- include/uapi/linux/netfilter/x_tables.h | 2 +- include/uapi/linux/netlink.h | 2 +- include/uapi/linux/sysctl.h | 2 +- 8 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 5ed721ad5b19..af2a44c08683 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -28,4 +28,9 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + #endif /* _UAPI_LINUX_CONST_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8938b76c4ee3..7857aa413627 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -14,7 +14,7 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H -#include +#include #include #include diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h index 0ff8f7477847..fadf2db71fe8 100644 --- a/include/uapi/linux/kernel.h +++ b/include/uapi/linux/kernel.h @@ -3,13 +3,6 @@ #define _UAPI_LINUX_KERNEL_H #include - -/* - * 'kernel.h' contains some often-used function prototypes etc - */ -#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) - -#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#include #endif /* _UAPI_LINUX_KERNEL_H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index f9a1be7fc696..ead2e72e5c88 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -21,7 +21,7 @@ #define _UAPI_LINUX_LIGHTNVM_H #ifdef __KERNEL__ -#include +#include #include #else /* __KERNEL__ */ #include diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index c36177a86516..a1fd6173e2db 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -2,7 +2,7 @@ #ifndef _UAPI__LINUX_MROUTE6_H #define _UAPI__LINUX_MROUTE6_H -#include +#include #include #include #include /* For struct sockaddr_in6. */ diff --git a/include/uapi/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h index a8283f7dbc51..b8c6bb233ac1 100644 --- a/include/uapi/linux/netfilter/x_tables.h +++ b/include/uapi/linux/netfilter/x_tables.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_X_TABLES_H #define _UAPI_X_TABLES_H -#include +#include #include #define XT_FUNCTION_MAXNAMELEN 30 diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0a4d73317759..622c78c821aa 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -2,7 +2,7 @@ #ifndef _UAPI__LINUX_NETLINK_H #define _UAPI__LINUX_NETLINK_H -#include +#include #include /* for __kernel_sa_family_t */ #include diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 87aa2a6d9125..cc453ed0e65e 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -23,7 +23,7 @@ #ifndef _UAPI_LINUX_SYSCTL_H #define _UAPI_LINUX_SYSCTL_H -#include +#include #include #include From 7c3d8d73bafdfda43dc36479fdd0f97a6a46b925 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 17 Dec 2020 14:55:01 -0300 Subject: [PATCH 20/62] tools headers UAPI: Sync linux/const.h with the kernel headers commit 7ddcdea5b54492f54700f427f58690cf1e187e5e upstream. To pick up the changes in: a85cbe6159ffc973 ("uapi: move constants from to ") That causes no changes in tooling, just addresses this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/const.h' differs from latest version at 'include/uapi/linux/const.h' diff -u tools/include/uapi/linux/const.h include/uapi/linux/const.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Petr Vorel Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/include/uapi/linux/const.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/uapi/linux/const.h b/tools/include/uapi/linux/const.h index 5ed721ad5b19..af2a44c08683 100644 --- a/tools/include/uapi/linux/const.h +++ b/tools/include/uapi/linux/const.h @@ -28,4 +28,9 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + #endif /* _UAPI_LINUX_CONST_H */ From 8ec95e3084180378f76a3fe849980cc5192ab38a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:11 +0900 Subject: [PATCH 21/62] null_blk: Fix zone size initialization commit 0ebcdd702f49aeb0ad2e2d894f8c124a0acc6e23 upstream. For a null_blk device with zoned mode enabled is currently initialized with a number of zones equal to the device capacity divided by the zone size, without considering if the device capacity is a multiple of the zone size. If the zone size is not a divisor of the capacity, the zones end up not covering the entire capacity, potentially resulting is out of bounds accesses to the zone array. Fix this by adding one last smaller zone with a size equal to the remainder of the disk capacity divided by the zone size if the capacity is not a multiple of the zone size. For such smaller last zone, the zone capacity is also checked so that it does not exceed the smaller zone size. Reported-by: Naohiro Aota Fixes: ca4b2a011948 ("null_blk: add zone support") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/null_blk_zoned.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 2553e05e0725..5f1376578ea3 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -2,8 +2,7 @@ #include #include "null_blk.h" -/* zone_size in MBs to sectors. */ -#define ZONE_SIZE_SHIFT 11 +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) { @@ -12,7 +11,7 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) int null_zone_init(struct nullb_device *dev) { - sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t dev_capacity_sects; sector_t sector = 0; unsigned int i; @@ -25,9 +24,12 @@ int null_zone_init(struct nullb_device *dev) return -EINVAL; } - dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; - dev->nr_zones = dev_size >> - (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), GFP_KERNEL | __GFP_ZERO); if (!dev->zones) @@ -55,7 +57,10 @@ int null_zone_init(struct nullb_device *dev) struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; - zone->len = dev->zone_size_sects; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; From 8ddf02859c691d6e37fe2ca878750fb8f293ea16 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:12 +0100 Subject: [PATCH 22/62] of: fix linker-section match-table corruption commit 5812b32e01c6d86ba7a84110702b46d8a8531fe9 upstream. Specify type alignment when declaring linker-section match-table entries to prevent gcc from increasing alignment and corrupting the various tables with padding (e.g. timers, irqchips, clocks, reserved memory). This is specifically needed on x86 where gcc (typically) aligns larger objects like struct of_device_id with static extent on 32-byte boundaries which at best prevents matching on anything but the first entry. Specifying alignment when declaring variables suppresses this optimisation. Here's a 64-bit example where all entries are corrupt as 16 bytes of padding has been inserted before the first entry: ffffffff8266b4b0 D __clk_of_table ffffffff8266b4c0 d __of_table_fixed_factor_clk ffffffff8266b5a0 d __of_table_fixed_clk ffffffff8266b680 d __clk_of_table_sentinel And here's a 32-bit example where the 8-byte-aligned table happens to be placed on a 32-byte boundary so that all but the first entry are corrupt due to the 28 bytes of padding inserted between entries: 812b3ec0 D __irqchip_of_table 812b3ec0 d __of_table_irqchip1 812b3fa0 d __of_table_irqchip2 812b4080 d __of_table_irqchip3 812b4160 d irqchip_of_match_end Verified on x86 using gcc-9.3 and gcc-4.9 (which uses 64-byte alignment), and on arm using gcc-7.2. Note that there are no in-tree users of these tables on x86 currently (even if they are included in the image). Fixes: 54196ccbe0ba ("of: consolidate linker section OF match table declarations") Fixes: f6e916b82022 ("irqchip: add basic infrastructure") Cc: stable # 3.9 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20201123102319.8090-2-johan@kernel.org [ johan: adjust context to 5.4 ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/linux/of.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/of.h b/include/linux/of.h index 844f89e1b039..a7621e2b440a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1282,6 +1282,7 @@ static inline int of_get_available_child_count(const struct device_node *np) #define _OF_DECLARE(table, name, compat, fn, fn_type) \ static const struct of_device_id __of_table_##name \ __used __section(__##table##_of_table) \ + __aligned(__alignof__(struct of_device_id)) \ = { .compatible = compat, \ .data = (fn == (fn_type)NULL) ? fn : fn } #else From 57ba2c7a50bfe02063fe4591ebfc92788fbde965 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Thu, 10 Dec 2020 09:29:43 +0800 Subject: [PATCH 23/62] cgroup: Fix memory leak when parsing multiple source parameters commit 2d18e54dd8662442ef5898c6bdadeaf90b3cebbc upstream. A memory leak is found in cgroup1_parse_param() when multiple source parameters overwrite fc->source in the fs_context struct without free. unreferenced object 0xffff888100d930e0 (size 16): comm "mount", pid 520, jiffies 4303326831 (age 152.783s) hex dump (first 16 bytes): 74 65 73 74 6c 65 61 6b 00 00 00 00 00 00 00 00 testleak........ backtrace: [<000000003e5023ec>] kmemdup_nul+0x2d/0xa0 [<00000000377dbdaa>] vfs_parse_fs_string+0xc0/0x150 [<00000000cb2b4882>] generic_parse_monolithic+0x15a/0x1d0 [<000000000f750198>] path_mount+0xee1/0x1820 [<0000000004756de2>] do_mount+0xea/0x100 [<0000000094cafb0a>] __x64_sys_mount+0x14b/0x1f0 Fix this bug by permitting a single source parameter and rejecting with an error all subsequent ones. Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing") Reported-by: Hulk Robot Signed-off-by: Qinglang Miao Reviewed-by: Zefan Li Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f684c82efc2e..79682c23407c 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -914,6 +914,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { if (strcmp(param->key, "source") == 0) { + if (fc->source) + return invalf(fc, "Multiple sources not supported"); fc->source = param->string; param->string = NULL; return 0; From b726f86022076ceaedb0bbfa2e7a2c975b569608 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Dec 2020 14:05:05 -0800 Subject: [PATCH 24/62] scsi: cxgb4i: Fix TLS dependency commit cb5253198f10a4cd79b7523c581e6173c7d49ddb upstream. SCSI_CXGB4_ISCSI selects CHELSIO_T4. The latter depends on TLS || TLS=n, so since 'select' does not check dependencies of the selected symbol, SCSI_CXGB4_ISCSI should also depend on TLS || TLS=n. This prevents the following kconfig warning and restricts SCSI_CXGB4_ISCSI to 'm' whenever TLS=m. WARNING: unmet direct dependencies detected for CHELSIO_T4 Depends on [m]: NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_CHELSIO [=y] && PCI [=y] && (IPV6 [=y] || IPV6 [=y]=n) && (TLS [=m] || TLS [=m]=n) Selected by [y]: - SCSI_CXGB4_ISCSI [=y] && SCSI_LOWLEVEL [=y] && SCSI [=y] && PCI [=y] && INET [=y] && (IPV6 [=y] || IPV6 [=y]=n) && ETHERNET [=y] Link: https://lore.kernel.org/r/20201208220505.24488-1-rdunlap@infradead.org Fixes: 7b36b6e03b0d ("[SCSI] cxgb4i v5: iscsi driver") Cc: Karen Xie Cc: linux-scsi@vger.kernel.org Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Signed-off-by: Randy Dunlap Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxgbi/cxgb4i/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig index d1f1baba3285..d1bdd754c6a4 100644 --- a/drivers/scsi/cxgbi/cxgb4i/Kconfig +++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig @@ -4,6 +4,7 @@ config SCSI_CXGB4_ISCSI depends on PCI && INET && (IPV6 || IPV6=n) depends on THERMAL || !THERMAL depends on ETHERNET + depends on TLS || TLS=n select NET_VENDOR_CHELSIO select CHELSIO_T4 select CHELSIO_LIB From 18e1101b0ee9b9f2483e0efd0dcf3763b3915026 Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Wed, 30 Sep 2020 00:28:15 +0530 Subject: [PATCH 25/62] Bluetooth: hci_h5: close serdev device and free hu in h5_close commit 70f259a3f4276b71db365b1d6ff1eab805ea6ec3 upstream. When h5_close() gets called, the memory allocated for the hu gets freed only if hu->serdev doesn't exist. This leads to a memory leak. So when h5_close() is requested, close the serdev device instance and free the memory allocated to the hu entirely instead. Fixes: https://syzkaller.appspot.com/bug?extid=6ce141c55b2f7aafd1c4 Reported-by: syzbot+6ce141c55b2f7aafd1c4@syzkaller.appspotmail.com Tested-by: syzbot+6ce141c55b2f7aafd1c4@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_h5.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_h5.c b/drivers/bluetooth/hci_h5.c index e11af747395d..17b0f1b793ec 100644 --- a/drivers/bluetooth/hci_h5.c +++ b/drivers/bluetooth/hci_h5.c @@ -250,8 +250,12 @@ static int h5_close(struct hci_uart *hu) if (h5->vnd && h5->vnd->close) h5->vnd->close(h5); - if (!hu->serdev) - kfree(h5); + if (hu->serdev) + serdev_device_close(hu->serdev); + + kfree_skb(h5->rx_skb); + kfree(h5); + h5 = NULL; return 0; } From 01be033cc127dd3718eecc7783e1faa0416ea113 Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Sun, 1 Nov 2020 06:09:58 -0800 Subject: [PATCH 26/62] reiserfs: add check for an invalid ih_entry_count commit d24396c5290ba8ab04ba505176874c4e04a2d53c upstream. when directory item has an invalid value set for ih_entry_count it might trigger use-after-free or out-of-bounds read in bin_search_in_dir_item() ih_entry_count * IH_SIZE for directory item should not be larger than ih_item_len Link: https://lore.kernel.org/r/20201101140958.3650143-1-rkovhaev@gmail.com Reported-and-tested-by: syzbot+83b6f7cf9922cae5c4d7@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=83b6f7cf9922cae5c4d7 Signed-off-by: Rustam Kovhaev Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/stree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index bb4973aefbb1..9e64e23014e8 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -454,6 +454,12 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) "(second one): %h", ih); return 0; } + if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) { + reiserfs_warning(NULL, "reiserfs-5093", + "item entry count seems wrong %h", + ih); + return 0; + } prev_location = ih_location(ih); } From 750627d36f84bc3880d9059b5afa5207a2ddeb43 Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Mon, 23 Nov 2020 04:15:34 +0530 Subject: [PATCH 27/62] misc: vmw_vmci: fix kernel info-leak by initializing dbells in vmci_ctx_get_chkpt_doorbells() commit 31dcb6c30a26d32650ce134820f27de3c675a45a upstream. A kernel-infoleak was reported by syzbot, which was caused because dbells was left uninitialized. Using kzalloc() instead of kmalloc() fixes this issue. Reported-by: syzbot+a79e17c39564bedf0930@syzkaller.appspotmail.com Tested-by: syzbot+a79e17c39564bedf0930@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Link: https://lore.kernel.org/r/20201122224534.333471-1-anant.thazhemadam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_vmci/vmci_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index 16695366ec92..26ff49fdf0f7 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -743,7 +743,7 @@ static int vmci_ctx_get_chkpt_doorbells(struct vmci_ctx *context, return VMCI_ERROR_MORE_DATA; } - dbells = kmalloc(data_size, GFP_ATOMIC); + dbells = kzalloc(data_size, GFP_ATOMIC); if (!dbells) return VMCI_ERROR_NO_MEM; From 28a29e3a658a444f34eabcc788098a97c81b4237 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 27 Nov 2020 07:40:21 +0100 Subject: [PATCH 28/62] media: gp8psk: initialize stats at power control logic commit d0ac1a26ed5943127cb0156148735f5f52a07075 upstream. As reported on: https://lore.kernel.org/linux-media/20190627222020.45909-1-willemdebruijn.kernel@gmail.com/ if gp8psk_usb_in_op() returns an error, the status var is not initialized. Yet, this var is used later on, in order to identify: - if the device was already started; - if firmware has loaded; - if the LNBf was powered on. Using status = 0 seems to ensure that everything will be properly powered up. So, instead of the proposed solution, let's just set status = 0. Reported-by: syzbot Reported-by: Willem de Bruijn Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/dvb-usb/gp8psk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/usb/dvb-usb/gp8psk.c b/drivers/media/usb/dvb-usb/gp8psk.c index 1282f701f185..ac8b8bf6ee1d 100644 --- a/drivers/media/usb/dvb-usb/gp8psk.c +++ b/drivers/media/usb/dvb-usb/gp8psk.c @@ -182,7 +182,7 @@ out_rel_fw: static int gp8psk_power_ctrl(struct dvb_usb_device *d, int onoff) { - u8 status, buf; + u8 status = 0, buf; int gp_product_id = le16_to_cpu(d->udev->descriptor.idProduct); if (onoff) { From 4250fe65b2e6c0e4163e74e6f4ec1455f9f5dd08 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:49:36 +0800 Subject: [PATCH 29/62] f2fs: fix shift-out-of-bounds in sanity_check_raw_super() commit e584bbe821229a3e7cc409eecd51df66f9268c21 upstream. syzbot reported a bug which could cause shift-out-of-bounds issue, fix it. Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 sanity_check_raw_super fs/f2fs/super.c:2812 [inline] read_raw_super_block fs/f2fs/super.c:3267 [inline] f2fs_fill_super.cold+0x16c9/0x16f6 fs/f2fs/super.c:3519 mount_bdev+0x34d/0x410 fs/super.c:1366 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1496 do_new_mount fs/namespace.c:2896 [inline] path_mount+0x12ae/0x1e70 fs/namespace.c:3227 do_mount fs/namespace.c:3240 [inline] __do_sys_mount fs/namespace.c:3448 [inline] __se_sys_mount fs/namespace.c:3425 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3425 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+ca9a785f8ac472085994@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fa461db696e7..a9a083232bcf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2523,7 +2523,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); - unsigned int blocksize; size_t crc_offset = 0; __u32 crc = 0; @@ -2557,10 +2556,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } /* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != F2FS_BLKSIZE) { - f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB", - blocksize); + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; } From 8d2204a0539192cc75727e8917e1bb6b2a9e40bb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 6 Dec 2020 09:34:56 +0100 Subject: [PATCH 30/62] ALSA: seq: Use bool for snd_seq_queue internal flags commit 4ebd47037027c4beae99680bff3b20fdee5d7c1e upstream. The snd_seq_queue struct contains various flags in the bit fields. Those are categorized to two different use cases, both of which are protected by different spinlocks. That implies that there are still potential risks of the bad operations for bit fields by concurrent accesses. For addressing the problem, this patch rearranges those flags to be a standard bool instead of a bit field. Reported-by: syzbot+63cbe31877bb80ef58f5@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20201206083456.21110-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/seq/seq_queue.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_queue.h b/sound/core/seq/seq_queue.h index 9254c8dbe5e3..25d2d6b61007 100644 --- a/sound/core/seq/seq_queue.h +++ b/sound/core/seq/seq_queue.h @@ -26,10 +26,10 @@ struct snd_seq_queue { struct snd_seq_timer *timer; /* time keeper for this queue */ int owner; /* client that 'owns' the timer */ - unsigned int locked:1, /* timer is only accesibble by owner if set */ - klocked:1, /* kernel lock (after START) */ - check_again:1, - check_blocked:1; + bool locked; /* timer is only accesibble by owner if set */ + bool klocked; /* kernel lock (after START) */ + bool check_again; /* concurrent access happened during check */ + bool check_blocked; /* queue being checked */ unsigned int flags; /* status flags */ unsigned int info_flags; /* info for sync */ From 3a2a5e197a8452b5af9d964a8fab4cdd06a8c381 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 6 Dec 2020 09:35:27 +0100 Subject: [PATCH 31/62] ALSA: rawmidi: Access runtime->avail always in spinlock commit 88a06d6fd6b369d88cec46c62db3e2604a2f50d5 upstream. The runtime->avail field may be accessed concurrently while some places refer to it without taking the runtime->lock spinlock, as detected by KCSAN. Usually this isn't a big problem, but for consistency and safety, we should take the spinlock at each place referencing this field. Reported-by: syzbot+a23a6f1215c84756577c@syzkaller.appspotmail.com Reported-by: syzbot+3d367d1df1d2b67f5c19@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20201206083527.21163-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/rawmidi.c | 49 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 94db4683cfaf..6a3543b8455f 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -72,11 +72,21 @@ static inline unsigned short snd_rawmidi_file_flags(struct file *file) } } -static inline int snd_rawmidi_ready(struct snd_rawmidi_substream *substream) +static inline bool __snd_rawmidi_ready(struct snd_rawmidi_runtime *runtime) +{ + return runtime->avail >= runtime->avail_min; +} + +static bool snd_rawmidi_ready(struct snd_rawmidi_substream *substream) { struct snd_rawmidi_runtime *runtime = substream->runtime; + unsigned long flags; + bool ready; - return runtime->avail >= runtime->avail_min; + spin_lock_irqsave(&runtime->lock, flags); + ready = __snd_rawmidi_ready(runtime); + spin_unlock_irqrestore(&runtime->lock, flags); + return ready; } static inline int snd_rawmidi_ready_append(struct snd_rawmidi_substream *substream, @@ -945,7 +955,7 @@ int snd_rawmidi_receive(struct snd_rawmidi_substream *substream, if (result > 0) { if (runtime->event) schedule_work(&runtime->event_work); - else if (snd_rawmidi_ready(substream)) + else if (__snd_rawmidi_ready(runtime)) wake_up(&runtime->sleep); } spin_unlock_irqrestore(&runtime->lock, flags); @@ -1024,7 +1034,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun result = 0; while (count > 0) { spin_lock_irq(&runtime->lock); - while (!snd_rawmidi_ready(substream)) { + while (!__snd_rawmidi_ready(runtime)) { wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { @@ -1041,9 +1051,11 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun return -ENODEV; if (signal_pending(current)) return result > 0 ? result : -ERESTARTSYS; - if (!runtime->avail) - return result > 0 ? result : -EIO; spin_lock_irq(&runtime->lock); + if (!runtime->avail) { + spin_unlock_irq(&runtime->lock); + return result > 0 ? result : -EIO; + } } spin_unlock_irq(&runtime->lock); count1 = snd_rawmidi_kernel_read1(substream, @@ -1181,7 +1193,7 @@ int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int coun runtime->avail += count; substream->bytes += count; if (count > 0) { - if (runtime->drain || snd_rawmidi_ready(substream)) + if (runtime->drain || __snd_rawmidi_ready(runtime)) wake_up(&runtime->sleep); } return count; @@ -1370,9 +1382,11 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, return -ENODEV; if (signal_pending(current)) return result > 0 ? result : -ERESTARTSYS; - if (!runtime->avail && !timeout) - return result > 0 ? result : -EIO; spin_lock_irq(&runtime->lock); + if (!runtime->avail && !timeout) { + spin_unlock_irq(&runtime->lock); + return result > 0 ? result : -EIO; + } } spin_unlock_irq(&runtime->lock); count1 = snd_rawmidi_kernel_write1(substream, buf, NULL, count); @@ -1452,6 +1466,7 @@ static void snd_rawmidi_proc_info_read(struct snd_info_entry *entry, struct snd_rawmidi *rmidi; struct snd_rawmidi_substream *substream; struct snd_rawmidi_runtime *runtime; + unsigned long buffer_size, avail, xruns; rmidi = entry->private_data; snd_iprintf(buffer, "%s\n\n", rmidi->name); @@ -1470,13 +1485,16 @@ static void snd_rawmidi_proc_info_read(struct snd_info_entry *entry, " Owner PID : %d\n", pid_vnr(substream->pid)); runtime = substream->runtime; + spin_lock_irq(&runtime->lock); + buffer_size = runtime->buffer_size; + avail = runtime->avail; + spin_unlock_irq(&runtime->lock); snd_iprintf(buffer, " Mode : %s\n" " Buffer size : %lu\n" " Avail : %lu\n", runtime->oss ? "OSS compatible" : "native", - (unsigned long) runtime->buffer_size, - (unsigned long) runtime->avail); + buffer_size, avail); } } } @@ -1494,13 +1512,16 @@ static void snd_rawmidi_proc_info_read(struct snd_info_entry *entry, " Owner PID : %d\n", pid_vnr(substream->pid)); runtime = substream->runtime; + spin_lock_irq(&runtime->lock); + buffer_size = runtime->buffer_size; + avail = runtime->avail; + xruns = runtime->xruns; + spin_unlock_irq(&runtime->lock); snd_iprintf(buffer, " Buffer size : %lu\n" " Avail : %lu\n" " Overruns : %lu\n", - (unsigned long) runtime->buffer_size, - (unsigned long) runtime->avail, - (unsigned long) runtime->xruns); + buffer_size, avail, xruns); } } } From 5b2f1ad6b12ba8cc17b33436a92ed3a1443fa251 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Dec 2020 20:45:44 -0800 Subject: [PATCH 32/62] bfs: don't use WARNING: string when it's just info. commit dc889b8d4a8122549feabe99eead04e6b23b6513 upstream. Make the printk() [bfs "printf" macro] seem less severe by changing "WARNING:" to "NOTE:". warns us about using WARNING or BUG in a format string other than in WARN() or BUG() family macros. bfs/inode.c is doing just that in a normal printk() call, so change the "WARNING" string to be "NOTE". Link: https://lkml.kernel.org/r/20201203212634.17278-1-rdunlap@infradead.org Reported-by: syzbot+3fd34060f26e766536ff@syzkaller.appspotmail.com Signed-off-by: Randy Dunlap Cc: Dmitry Vyukov Cc: Al Viro Cc: "Tigran A. Aivazian" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/bfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f8ce1368218b..1a8f3c8ab32c 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -351,7 +351,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; if (info->si_lasti == BFS_MAX_LASTI) - printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id); + printf("NOTE: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id); else if (info->si_lasti > BFS_MAX_LASTI) { printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id); goto out1; From 642c2d74c365ae8f642e50e3c141a8b8b35cc7dd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 5 Nov 2020 14:23:51 +0800 Subject: [PATCH 33/62] fcntl: Fix potential deadlock in send_sig{io, urg}() commit 8d1ddb5e79374fb277985a6b3faa2ed8631c5b4c upstream. Syzbot reports a potential deadlock found by the newly added recursive read deadlock detection in lockdep: [...] ======================================================== [...] WARNING: possible irq lock inversion dependency detected [...] 5.9.0-rc2-syzkaller #0 Not tainted [...] -------------------------------------------------------- [...] syz-executor.1/10214 just changed the state of lock: [...] ffff88811f506338 (&f->f_owner.lock){.+..}-{2:2}, at: send_sigurg+0x1d/0x200 [...] but this lock was taken by another, HARDIRQ-safe lock in the past: [...] (&dev->event_lock){-...}-{2:2} [...] [...] [...] and interrupts could create inverse lock ordering between them. [...] [...] [...] other info that might help us debug this: [...] Chain exists of: [...] &dev->event_lock --> &new->fa_lock --> &f->f_owner.lock [...] [...] Possible interrupt unsafe locking scenario: [...] [...] CPU0 CPU1 [...] ---- ---- [...] lock(&f->f_owner.lock); [...] local_irq_disable(); [...] lock(&dev->event_lock); [...] lock(&new->fa_lock); [...] [...] lock(&dev->event_lock); [...] [...] *** DEADLOCK *** The corresponding deadlock case is as followed: CPU 0 CPU 1 CPU 2 read_lock(&fown->lock); spin_lock_irqsave(&dev->event_lock, ...) write_lock_irq(&filp->f_owner.lock); // wait for the lock read_lock(&fown-lock); // have to wait until the writer release // due to the fairness spin_lock_irqsave(&dev->event_lock); // wait for the lock The lock dependency on CPU 1 happens if there exists a call sequence: input_inject_event(): spin_lock_irqsave(&dev->event_lock,...); input_handle_event(): input_pass_values(): input_to_handler(): handler->event(): // evdev_event() evdev_pass_values(): spin_lock(&client->buffer_lock); __pass_event(): kill_fasync(): kill_fasync_rcu(): read_lock(&fa->fa_lock); send_sigio(): read_lock(&fown->lock); To fix this, make the reader in send_sigurg() and send_sigio() use read_lock_irqsave() and read_lock_irqrestore(). Reported-by: syzbot+22e87cdf94021b984aa6@syzkaller.appspotmail.com Reported-by: syzbot+c5e32344981ad9f33750@syzkaller.appspotmail.com Signed-off-by: Boqun Feng Signed-off-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 3d40771e8e7c..3dc90e5293e6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -779,9 +779,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; + unsigned long flags; struct pid *pid; - read_lock(&fown->lock); + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; @@ -802,7 +803,7 @@ void send_sigio(struct fown_struct *fown, int fd, int band) read_unlock(&tasklist_lock); } out_unlock_fown: - read_unlock(&fown->lock); + read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, @@ -817,9 +818,10 @@ int send_sigurg(struct fown_struct *fown) struct task_struct *p; enum pid_type type; struct pid *pid; + unsigned long flags; int ret = 0; - read_lock(&fown->lock); + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; @@ -842,7 +844,7 @@ int send_sigurg(struct fown_struct *fown) read_unlock(&tasklist_lock); } out_unlock_fown: - read_unlock(&fown->lock); + read_unlock_irqrestore(&fown->lock, flags); return ret; } From 569da7c3d9a31522433ad71c9d204b1df89bff90 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Tue, 20 Oct 2020 14:12:26 +0800 Subject: [PATCH 34/62] rtc: sun6i: Fix memleak in sun6i_rtc_clk_init [ Upstream commit 28d211919e422f58c1e6c900e5810eee4f1ce4c8 ] When clk_hw_register_fixed_rate_with_accuracy() fails, clk_data should be freed. It's the same for the subsequent two error paths, but we should also unregister the already registered clocks in them. Signed-off-by: Dinghao Liu Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201020061226.6572-1-dinghao.liu@zju.edu.cn Signed-off-by: Sasha Levin --- drivers/rtc/rtc-sun6i.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index fc32be687606..c41bc8084d7c 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -276,7 +276,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, 300000000); if (IS_ERR(rtc->int_osc)) { pr_crit("Couldn't register the internal oscillator\n"); - return; + goto err; } parents[0] = clk_hw_get_name(rtc->int_osc); @@ -292,7 +292,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, rtc->losc = clk_register(NULL, &rtc->hw); if (IS_ERR(rtc->losc)) { pr_crit("Couldn't register the LOSC clock\n"); - return; + goto err_register; } of_property_read_string_index(node, "clock-output-names", 1, @@ -303,7 +303,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, &rtc->lock); if (IS_ERR(rtc->ext_losc)) { pr_crit("Couldn't register the LOSC external gate\n"); - return; + goto err_register; } clk_data->num = 2; @@ -316,6 +316,8 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data); return; +err_register: + clk_hw_unregister_fixed_rate(rtc->int_osc); err: kfree(clk_data); } From 1842dde0dd13250a5b019e7d77153163feab353f Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 27 Oct 2020 15:03:36 +0100 Subject: [PATCH 35/62] module: set MODULE_STATE_GOING state when a module fails to load [ Upstream commit 5e8ed280dab9eeabc1ba0b2db5dbe9fe6debb6b5 ] If a module fails to load due to an error in prepare_coming_module(), the following error handling in load_module() runs with MODULE_STATE_COMING in module's state. Fix it by correctly setting MODULE_STATE_GOING under "bug_cleanup" label. Signed-off-by: Miroslav Benes Signed-off-by: Jessica Yu Signed-off-by: Sasha Levin --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/module.c b/kernel/module.c index 45513909b01d..806a7196754a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3953,6 +3953,7 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); klp_module_going(mod); bug_cleanup: + mod->state = MODULE_STATE_GOING; /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); From e2926630f6539b6809547ae51a878d0f3d8cb74e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 2 Nov 2020 16:32:10 +0100 Subject: [PATCH 36/62] quota: Don't overflow quota file offsets [ Upstream commit 10f04d40a9fa29785206c619f80d8beedb778837 ] The on-disk quota format supports quota files with upto 2^32 blocks. Be careful when computing quota file offsets in the quota files from block numbers as they can overflow 32-bit types. Since quota files larger than 4GB would require ~26 millions of quota users, this is mostly a theoretical concern now but better be careful, fuzzers would find the problem sooner or later anyway... Reviewed-by: Andreas Dilger Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/quota/quota_tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index a6f856f341dc..c5562c871c8b 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); } static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) @@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ssize_t ret; ret = sb->s_op->quota_write(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) @@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, blk); goto out_buf; } - dquot->dq_off = (blk << info->dqi_blocksize_bits) + + dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); @@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ret = -EIO; goto out_buf; } else { - ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf: From a95049c51417522b466ce4fcdfd6668afe226938 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Thu, 12 Nov 2020 17:31:39 +0800 Subject: [PATCH 37/62] rtc: pl031: fix resource leak in pl031_probe [ Upstream commit 1eab0fea2514b269e384c117f5b5772b882761f0 ] When devm_rtc_allocate_device is failed in pl031_probe, it should release mem regions with device. Reported-by: Hulk Robot Signed-off-by: Zheng Liang Signed-off-by: Alexandre Belloni Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20201112093139.32566-1-zhengliang6@huawei.com Signed-off-by: Sasha Levin --- drivers/rtc/rtc-pl031.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index 180caebbd355..9566958476df 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -379,8 +379,10 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) device_init_wakeup(&adev->dev, true); ldata->rtc = devm_rtc_allocate_device(&adev->dev); - if (IS_ERR(ldata->rtc)) - return PTR_ERR(ldata->rtc); + if (IS_ERR(ldata->rtc)) { + ret = PTR_ERR(ldata->rtc); + goto out; + } ldata->rtc->ops = ops; From 22f815627c64f35e199e1bc17d7225c127244e2a Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Wed, 28 Oct 2020 17:15:51 +0800 Subject: [PATCH 38/62] powerpc: sysdev: add missing iounmap() on error in mpic_msgr_probe() [ Upstream commit ffa1797040c5da391859a9556be7b735acbe1242 ] I noticed that iounmap() of msgr_block_addr before return from mpic_msgr_probe() in the error handling case is missing. So use devm_ioremap() instead of just ioremap() when remapping the message register block, so the mapping will be automatically released on probe failure. Signed-off-by: Qinglang Miao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201028091551.136400-1-miaoqinglang@huawei.com Signed-off-by: Sasha Levin --- arch/powerpc/sysdev/mpic_msgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index f6b253e2be40..36ec0bdd8b63 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -191,7 +191,7 @@ static int mpic_msgr_probe(struct platform_device *dev) /* IO map the message register block. */ of_address_to_resource(np, 0, &rsrc); - msgr_block_addr = ioremap(rsrc.start, resource_size(&rsrc)); + msgr_block_addr = devm_ioremap(&dev->dev, rsrc.start, resource_size(&rsrc)); if (!msgr_block_addr) { dev_err(&dev->dev, "Failed to iomap MPIC message registers"); return -EFAULT; From d52faa7fb12f444f70eae8b49b3d674ff8c8bf32 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Wed, 28 Oct 2020 17:15:43 +0800 Subject: [PATCH 39/62] i3c master: fix missing destroy_workqueue() on error in i3c_master_register [ Upstream commit 59165d16c699182b86b5c65181013f1fd88feb62 ] Add the missing destroy_workqueue() before return from i3c_master_register in the error handling case. Signed-off-by: Qinglang Miao Signed-off-by: Boris Brezillon Link: https://lore.kernel.org/linux-i3c/20201028091543.136167-1-miaoqinglang@huawei.com Signed-off-by: Sasha Levin --- drivers/i3c/master.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 6cc71c90f85e..19337aed9f23 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2492,7 +2492,7 @@ int i3c_master_register(struct i3c_master_controller *master, ret = i3c_master_bus_init(master); if (ret) - goto err_put_dev; + goto err_destroy_wq; ret = device_add(&master->dev); if (ret) @@ -2523,6 +2523,9 @@ err_del_dev: err_cleanup_bus: i3c_master_bus_cleanup(master); +err_destroy_wq: + destroy_workqueue(master->wq); + err_put_dev: put_device(&master->dev); From dbe184f6be1e0408fdec44f836e0a42741fbd7a6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Nov 2020 12:06:14 -0500 Subject: [PATCH 40/62] NFSv4: Fix a pNFS layout related use-after-free race when freeing the inode [ Upstream commit b6d49ecd1081740b6e632366428b960461f8158b ] When returning the layout in nfs4_evict_inode(), we need to ensure that the layout is actually done being freed before we can proceed to free the inode itself. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4super.c | 2 +- fs/nfs/pnfs.c | 33 +++++++++++++++++++++++++++++++-- fs/nfs/pnfs.h | 5 +++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 04c57066a11a..b90642b022eb 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -96,7 +96,7 @@ static void nfs4_evict_inode(struct inode *inode) nfs_inode_return_delegation_noreclaim(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); + pnfs_destroy_layout_final(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 9c2b07ce57b2..9fd115c4d0a2 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -294,6 +294,7 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; + unsigned long i_state; if (!lo) return; @@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + i_state = inode->i_state; spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (i_state & (I_FREEING | I_CLEAR)) + wake_up_var(lo); } } @@ -723,8 +728,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } } -void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -742,9 +746,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); +static bool pnfs_layout_removed(struct nfs_inode *nfsi, + struct pnfs_layout_hdr *lo) +{ + bool ret; + + spin_lock(&nfsi->vfs_inode.i_lock); + ret = nfsi->layout != lo; + spin_unlock(&nfsi->vfs_inode.i_lock); + return ret; +} + +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + + if (lo) + wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f8a38065c7e4..63da33a92d83 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -255,6 +255,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, @@ -651,6 +652,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } +static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From 86db71810a27f7d549cd38f4191a605b3253cb3a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Nov 2020 13:22:05 -0800 Subject: [PATCH 41/62] f2fs: avoid race condition for shrinker count [ Upstream commit a95ba66ac1457b76fe472c8e092ab1006271f16c ] Light reported sometimes shinker gets nat_cnt < dirty_nat_cnt resulting in wrong do_shinker work. Let's avoid to return insane overflowed value by adding single tracking value. Reported-by: Light Hsieh Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 11 ++++++----- fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/node.c | 29 ++++++++++++++++++----------- fs/f2fs/node.h | 4 ++-- fs/f2fs/shrinker.c | 4 +--- 6 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c966ccc44c15..a57219c51c01 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1596,7 +1596,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } - if (NM_I(sbi)->dirty_nat_cnt == 0 && + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 9b0bedd82581..d8d64447bc94 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -107,8 +107,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; @@ -254,9 +254,10 @@ get_cache: si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); - si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); - si->cache_mem += NM_I(sbi)->dirty_nat_cnt * - sizeof(struct nat_entry_set); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0ddc4a74b9d4..4ca3c2a0a0f5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -797,6 +797,13 @@ enum nid_state { MAX_NID_STATE, }; +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -812,8 +819,7 @@ struct f2fs_nm_info { struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ - unsigned int nat_cnt; /* the # of cached nat entries */ - unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3ac2a4b32375..7ce33698ae38 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_SHIFT; + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; @@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); - nm_i->nat_cnt++; + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; return ne; } @@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; __free_nat_entry(e); } @@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; - nm_i->dirty_nat_cnt++; + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; - nm_i->dirty_nat_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2881,14 +2885,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) LIST_HEAD(sets); int err = 0; - /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ if (enabled_nat_bits(sbi, cpc)) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); } - if (!nm_i->dirty_nat_cnt) + if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; down_write(&nm_i->nat_tree_lock); @@ -2899,7 +2906,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. */ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -3023,7 +3031,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; - nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; @@ -3160,7 +3167,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, natvec[idx]); } } - f2fs_bug_on(sbi, nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); /* destroy nat set cache */ nid = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e05af5df5648..4a2e7eaf2b02 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -123,13 +123,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index a467aca29cfe..3ceebaaee384 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,9 +18,7 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; - - return count > 0 ? count : 0; + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) From 9f4e8026d202f7097659a7efd87c02f029cf8ebe Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 27 Nov 2020 10:09:39 +0100 Subject: [PATCH 42/62] module: delay kobject uevent until after module init call [ Upstream commit 38dc717e97153e46375ee21797aa54777e5498f3 ] Apparently there has been a longstanding race between udev/systemd and the module loader. Currently, the module loader sends a uevent right after sysfs initialization, but before the module calls its init function. However, some udev rules expect that the module has initialized already upon receiving the uevent. This race has been triggered recently (see link in references) in some systemd mount unit files. For instance, the configfs module creates the /sys/kernel/config mount point in its init function, however the module loader issues the uevent before this happens. sys-kernel-config.mount expects to be able to mount /sys/kernel/config upon receipt of the module loading uevent, but if the configfs module has not called its init function yet, then this directory will not exist and the mount unit fails. A similar situation exists for sys-fs-fuse-connections.mount, as the fuse sysfs mount point is created during the fuse module's init function. If udev is faster than module initialization then the mount unit would fail in a similar fashion. To fix this race, delay the module KOBJ_ADD uevent until after the module has finished calling its init routine. References: https://github.com/systemd/systemd/issues/17586 Reviewed-by: Greg Kroah-Hartman Tested-By: Nicolas Morey-Chaisemartin Signed-off-by: Jessica Yu Signed-off-by: Sasha Levin --- kernel/module.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 806a7196754a..9e9af40698ff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1863,7 +1863,6 @@ static int mod_sysfs_init(struct module *mod) if (err) mod_kobject_put(mod); - /* delay uevent until full sysfs population */ out: return err; } @@ -1900,7 +1899,6 @@ static int mod_sysfs_setup(struct module *mod, add_sect_attrs(mod, info); add_notes_attrs(mod, info); - kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; out_unreg_modinfo_attrs: @@ -3608,6 +3606,9 @@ static noinline int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); + /* Delay uevent until module has finished its init routine */ + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); + /* * We need to finish all async code before the module init sequence * is done. This has potential to deadlock. For example, a newly From d32747bb687d176e54e3d356e6a200678c4e9550 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 31 Oct 2020 21:40:21 -0700 Subject: [PATCH 43/62] fs/namespace.c: WARN if mnt_count has become negative [ Upstream commit edf7ddbf1c5eb98b720b063b73e20e8a4a1ce673 ] Missing calls to mntget() (or equivalently, too many calls to mntput()) are hard to detect because mntput() delays freeing mounts using task_work_add(), then again using call_rcu(). As a result, mnt_count can often be decremented to -1 without getting a KASAN use-after-free report. Such cases are still bugs though, and they point to real use-after-frees being possible. For an example of this, see the bug fixed by commit 1b0b9cc8d379 ("vfs: fsmount: add missing mntget()"), discussed at https://lkml.kernel.org/linux-fsdevel/20190605135401.GB30925@xxxxxxxxxxxxxxxxxxxxxxxxx/T/#u. This bug *should* have been trivial to find. But actually, it wasn't found until syzkaller happened to use fchdir() to manipulate the reference count just right for the bug to be noticeable. Address this by making mntput_no_expire() issue a WARN if mnt_count has become negative. Suggested-by: Miklos Szeredi Signed-off-by: Eric Biggers Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/namespace.c | 9 ++++++--- fs/pnode.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 2adfe7b166a3..76ea92994d26 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n) /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct mount *mnt) +int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP - unsigned int count = 0; + int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -1123,6 +1123,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); + int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { @@ -1146,7 +1147,9 @@ static void mntput_no_expire(struct mount *mnt) */ smp_mb(); mnt_add_count(mnt, -1); - if (mnt_get_count(mnt)) { + count = mnt_get_count(mnt); + if (count != 0) { + WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; diff --git a/fs/pnode.h b/fs/pnode.h index 49a058c73e4c..26f74e092bd9 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); -unsigned int mnt_get_count(struct mount *mnt); +int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, From 1dab82dd202d27ad4266e2711e250bfe5127bcbf Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 21 Nov 2020 23:13:56 -0500 Subject: [PATCH 44/62] um: ubd: Submit all data segments atomically [ Upstream commit fc6b6a872dcd48c6f39c7975836d75113db67d37 ] Internally, UBD treats each physical IO segment as a separate command to be submitted in the execution pipe. If the pipe returns a transient error after a few segments have already been written, UBD will tell the block layer to requeue the request, but there is no way to reclaim the segments already submitted. When a new attempt to dispatch the request is done, those segments already submitted will get duplicated, causing the WARN_ON below in the best case, and potentially data corruption. In my system, running a UML instance with 2GB of RAM and a 50M UBD disk, I can reproduce the WARN_ON by simply running mkfs.fvat against the disk on a freshly booted system. There are a few ways to around this, like reducing the pressure on the pipe by reducing the queue depth, which almost eliminates the occurrence of the problem, increasing the pipe buffer size on the host system, or by limiting the request to one physical segment, which causes the block layer to submit way more requests to resolve a single operation. Instead, this patch modifies the format of a UBD command, such that all segments are sent through a single element in the communication pipe, turning the command submission atomic from the point of view of the block layer. The new format has a variable size, depending on the number of elements, and looks like this: +------------+-----------+-----------+------------ | cmd_header | segment 0 | segment 1 | segment ... +------------+-----------+-----------+------------ With this format, we push a pointer to cmd_header in the submission pipe. This has the advantage of reducing the memory footprint of executing a single request, since it allow us to merge some fields in the header. It is possible to reduce even further each segment memory footprint, by merging bitmap_words and cow_offset, for instance, but this is not the focus of this patch and is left as future work. One issue with the patch is that for a big number of segments, we now perform one big memory allocation instead of multiple small ones, but I wasn't able to trigger any real issues or -ENOMEM because of this change, that wouldn't be reproduced otherwise. This was tested using fio with the verify-crc32 option, and by running an ext4 filesystem over this UBD device. The original WARN_ON was: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141 refcount_t: underflow; use-after-free. Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346 Stack: 6084eed0 6063dc77 00000009 6084ef60 00000000 604b8d9f 6084eee0 6063dcbc 6084ef40 6006ab8d e013d780 1c00000000 Call Trace: [<600a0c1c>] ? printk+0x0/0x94 [<6004a888>] show_stack+0x13b/0x155 [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8 [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141 [<6063dcbc>] dump_stack+0x2a/0x2c [<6006ab8d>] __warn+0x107/0x134 [<6008da6c>] ? wake_up_process+0x17/0x19 [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15 [<600619ae>] ? os_nsecs+0x1d/0x2b [<604b8d9f>] refcount_warn_saturate+0x13f/0x141 [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37 [<6048c8de>] blk_mq_free_request+0xf1/0x10d [<6048ca06>] __blk_mq_end_request+0x10c/0x114 [<6005ac0f>] ubd_intr+0xb5/0x169 [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e [<600a1b70>] handle_irq_event_percpu+0x26/0x69 [<600a1bd9>] handle_irq_event+0x26/0x34 [<600a1bb3>] ? handle_irq_event+0x0/0x34 [<600a5186>] ? unmask_irq+0x0/0x37 [<600a57e6>] handle_edge_irq+0xbc/0xd6 [<600a131a>] generic_handle_irq+0x21/0x29 [<60048f6e>] do_IRQ+0x39/0x54 [...] ---[ end trace c6e7444e55386c0f ]--- Cc: Christopher Obbard Reported-by: Martyn Welch Signed-off-by: Gabriel Krisman Bertazi Tested-by: Christopher Obbard Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger Signed-off-by: Sasha Levin --- arch/um/drivers/ubd_kern.c | 193 ++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 77 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 0f5d0a699a49..4e59ab817d3e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -47,18 +47,25 @@ /* Max request size is determined by sector mask - 32K */ #define UBD_MAX_REQUEST (8 * sizeof(long)) +struct io_desc { + char *buffer; + unsigned long length; + unsigned long sector_mask; + unsigned long long cow_offset; + unsigned long bitmap_words[2]; +}; + struct io_thread_req { struct request *req; int fds[2]; unsigned long offsets[2]; unsigned long long offset; - unsigned long length; - char *buffer; int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; int error; + + int desc_cnt; + /* io_desc has to be the last element of the struct */ + struct io_desc io_desc[]; }; @@ -524,12 +531,7 @@ static void ubd_handler(void) blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); } - if ((io_req->error) || (io_req->buffer == NULL)) - blk_mq_end_request(io_req->req, io_req->error); - else { - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) - __blk_mq_end_request(io_req->req, io_req->error); - } + blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); } } @@ -945,6 +947,7 @@ static int ubd_add(int n, char **error_out) blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_max_segments(ubd_dev->queue, MAX_SG); + blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; @@ -1288,37 +1291,74 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, *cow_offset += bitmap_offset; } -static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, +static void cowify_req(struct io_thread_req *req, struct io_desc *segment, + unsigned long offset, unsigned long *bitmap, __u64 bitmap_offset, __u64 bitmap_len) { - __u64 sector = req->offset >> SECTOR_SHIFT; + __u64 sector = offset >> SECTOR_SHIFT; int i; - if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT) + if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT) panic("Operation too long"); if (req_op(req->req) == REQ_OP_READ) { - for (i = 0; i < req->length >> SECTOR_SHIFT; i++) { + for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) { if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) ubd_set_bit(i, (unsigned char *) - &req->sector_mask); + &segment->sector_mask); } + } else { + cowify_bitmap(offset, segment->length, &segment->sector_mask, + &segment->cow_offset, bitmap, bitmap_offset, + segment->bitmap_words, bitmap_len); } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); } -static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, - u64 off, struct bio_vec *bvec) +static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, + struct request *req) { - struct ubd *dev = hctx->queue->queuedata; - struct io_thread_req *io_req; - int ret; + struct bio_vec bvec; + struct req_iterator iter; + int i = 0; + unsigned long byte_offset = io_req->offset; + int op = req_op(req); - io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); + if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) { + io_req->io_desc[0].buffer = NULL; + io_req->io_desc[0].length = blk_rq_bytes(req); + } else { + rq_for_each_segment(bvec, req, iter) { + BUG_ON(i >= io_req->desc_cnt); + + io_req->io_desc[i].buffer = + page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].length = bvec.bv_len; + i++; + } + } + + if (dev->cow.file) { + for (i = 0; i < io_req->desc_cnt; i++) { + cowify_req(io_req, &io_req->io_desc[i], byte_offset, + dev->cow.bitmap, dev->cow.bitmap_offset, + dev->cow.bitmap_len); + byte_offset += io_req->io_desc[i].length; + } + + } +} + +static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req, + int desc_cnt) +{ + struct io_thread_req *io_req; + int i; + + io_req = kmalloc(sizeof(*io_req) + + (desc_cnt * sizeof(struct io_desc)), + GFP_ATOMIC); if (!io_req) - return -ENOMEM; + return NULL; io_req->req = req; if (dev->cow.file) @@ -1326,26 +1366,41 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, else io_req->fds[0] = dev->fd; io_req->error = 0; - - if (bvec != NULL) { - io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset; - io_req->length = bvec->bv_len; - } else { - io_req->buffer = NULL; - io_req->length = blk_rq_bytes(req); - } - io_req->sectorsize = SECTOR_SIZE; io_req->fds[1] = dev->fd; - io_req->cow_offset = -1; - io_req->offset = off; - io_req->sector_mask = 0; + io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT; io_req->offsets[0] = 0; io_req->offsets[1] = dev->cow.data_offset; - if (dev->cow.file) - cowify_req(io_req, dev->cow.bitmap, - dev->cow.bitmap_offset, dev->cow.bitmap_len); + for (i = 0 ; i < desc_cnt; i++) { + io_req->io_desc[i].sector_mask = 0; + io_req->io_desc[i].cow_offset = -1; + } + + return io_req; +} + +static int ubd_submit_request(struct ubd *dev, struct request *req) +{ + int segs = 0; + struct io_thread_req *io_req; + int ret; + int op = req_op(req); + + if (op == REQ_OP_FLUSH) + segs = 0; + else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) + segs = 1; + else + segs = blk_rq_nr_phys_segments(req); + + io_req = ubd_alloc_req(dev, req, segs); + if (!io_req) + return -ENOMEM; + + io_req->desc_cnt = segs; + if (segs) + ubd_map_req(dev, io_req, req); ret = os_write_file(thread_fd, &io_req, sizeof(io_req)); if (ret != sizeof(io_req)) { @@ -1356,22 +1411,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, return ret; } -static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req) -{ - struct req_iterator iter; - struct bio_vec bvec; - int ret; - u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT; - - rq_for_each_segment(bvec, req, iter) { - ret = ubd_queue_one_vec(hctx, req, off, &bvec); - if (ret < 0) - return ret; - off += bvec.bv_len; - } - return 0; -} - static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1384,17 +1423,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&ubd_dev->lock); switch (req_op(req)) { - /* operations with no lentgth/offset arguments */ case REQ_OP_FLUSH: - ret = ubd_queue_one_vec(hctx, req, 0, NULL); - break; case REQ_OP_READ: case REQ_OP_WRITE: - ret = queue_rw_req(hctx, req); - break; case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL); + ret = ubd_submit_request(ubd_dev, req); break; default: WARN_ON_ONCE(1); @@ -1482,22 +1516,22 @@ static int map_error(int error_code) * will result in unpredictable behaviour and/or crashes. */ -static int update_bitmap(struct io_thread_req *req) +static int update_bitmap(struct io_thread_req *req, struct io_desc *segment) { int n; - if(req->cow_offset == -1) + if (segment->cow_offset == -1) return map_error(0); - n = os_pwrite_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words), req->cow_offset); - if (n != sizeof(req->bitmap_words)) + n = os_pwrite_file(req->fds[1], &segment->bitmap_words, + sizeof(segment->bitmap_words), segment->cow_offset); + if (n != sizeof(segment->bitmap_words)) return map_error(-n); return map_error(0); } -static void do_io(struct io_thread_req *req) +static void do_io(struct io_thread_req *req, struct io_desc *desc) { char *buf = NULL; unsigned long len; @@ -1512,21 +1546,20 @@ static void do_io(struct io_thread_req *req) return; } - nsectors = req->length / req->sectorsize; + nsectors = desc->length / req->sectorsize; start = 0; do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); + bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask); end = start; while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) + (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit)) end++; off = req->offset + req->offsets[bit] + start * req->sectorsize; len = (end - start) * req->sectorsize; - if (req->buffer != NULL) - buf = &req->buffer[start * req->sectorsize]; + if (desc->buffer != NULL) + buf = &desc->buffer[start * req->sectorsize]; switch (req_op(req->req)) { case REQ_OP_READ: @@ -1566,7 +1599,8 @@ static void do_io(struct io_thread_req *req) start = end; } while(start < nsectors); - req->error = update_bitmap(req); + req->offset += len; + req->error = update_bitmap(req, desc); } /* Changed in start_io_thread, which is serialized by being called only @@ -1599,8 +1633,13 @@ int io_thread(void *arg) } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { + struct io_thread_req *req = (*io_req_buffer)[count]; + int i; + io_count++; - do_io((*io_req_buffer)[count]); + for (i = 0; !req->error && i < req->desc_cnt; i++) + do_io(req, &(req->io_desc[i])); + } written = 0; From 480abac78e032d83a5797e39947526e94e7c8427 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:12:55 +0100 Subject: [PATCH 45/62] tick/sched: Remove bogus boot "safety" check [ Upstream commit ba8ea8e7dd6e1662e34e730eadfc52aa6816f9dd ] can_stop_idle_tick() checks whether the do_timer() duty has been taken over by a CPU on boot. That's silly because the boot CPU always takes over with the initial clockevent device. But even if no CPU would have installed a clockevent and taken over the duty then the question whether the tick on the current CPU can be stopped or not is moot. In that case the current CPU would have no clockevent either, so there would be nothing to keep ticking. Remove it. Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201206212002.725238293@linutronix.de Signed-off-by: Sasha Levin --- kernel/time/tick-sched.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5c9fcc72460d..4419486d7413 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -916,13 +916,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) */ if (tick_do_timer_cpu == cpu) return false; - /* - * Boot safety: make sure the timekeeping duty has been - * assigned before entering dyntick-idle mode, - * tick_do_timer_cpu is TICK_DO_TIMER_BOOT - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) - return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) From 8b3c00977264340450ea4131e97c7be30944edc1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Dec 2020 15:56:25 +0100 Subject: [PATCH 46/62] ALSA: pcm: Clear the full allocated memory at hw_params [ Upstream commit 618de0f4ef11acd8cf26902e65493d46cc20cc89 ] The PCM hw_params core function tries to clear up the PCM buffer before actually using for avoiding the information leak from the previous usages or the usage before a new allocation. It performs the memset() with runtime->dma_bytes, but this might still leave some remaining bytes untouched; namely, the PCM buffer size is aligned in page size for mmap, hence runtime->dma_bytes doesn't necessarily cover all PCM buffer pages, and the remaining bytes are exposed via mmap. This patch changes the memory clearance to cover the all buffer pages if the stream is supposed to be mmap-ready (that guarantees that the buffer size is aligned in page size). Reviewed-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20201218145625.2045-3-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/core/pcm_native.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index ec501fbaabe4..0c5b7a54ca81 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -717,8 +717,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, runtime->boundary *= 2; /* clear the buffer for avoiding possible kernel info leaks */ - if (runtime->dma_area && !substream->ops->copy_user) - memset(runtime->dma_area, 0, runtime->dma_bytes); + if (runtime->dma_area && !substream->ops->copy_user) { + size_t size = runtime->dma_bytes; + + if (runtime->info & SNDRV_PCM_INFO_MMAP) + size = PAGE_ALIGN(size); + memset(runtime->dma_area, 0, size); + } snd_pcm_timer_resolution_change(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); From 41ae3e574ccf41e814faee335b762cb3abf5598c Mon Sep 17 00:00:00 2001 From: Hyeongseok Kim Date: Thu, 3 Dec 2020 09:46:59 +0900 Subject: [PATCH 47/62] dm verity: skip verity work if I/O error when system is shutting down [ Upstream commit 252bd1256396cebc6fc3526127fdb0b317601318 ] If emergency system shutdown is called, like by thermal shutdown, a dm device could be alive when the block device couldn't process I/O requests anymore. In this state, the handling of I/O errors by new dm I/O requests or by those already in-flight can lead to a verity corruption state, which is a misjudgment. So, skip verity work in response to I/O error when system is shutting down. Signed-off-by: Hyeongseok Kim Reviewed-by: Sami Tolvanen Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin --- drivers/md/dm-verity-target.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4fb33e7562c5..2aeb922e2365 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -533,6 +533,15 @@ static int verity_verify_io(struct dm_verity_io *io) return 0; } +/* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* * End one "io" structure with a given error. */ @@ -560,7 +569,8 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { verity_finish_io(io, bio->bi_status); return; } From b3f656a592f3ade657d14888fd3dc92a14975890 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 Jan 2021 14:48:41 +0100 Subject: [PATCH 48/62] Linux 5.4.87 Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Shuah Khan Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20210104155705.740576914@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e1a94c8d278e..71968b4bb313 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 86 +SUBLEVEL = 87 EXTRAVERSION = NAME = Kleptomaniac Octopus From da5b4cf021b9b239377a139e1d6a9055c89d984a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jan 2021 11:45:45 -0500 Subject: [PATCH 49/62] Revert "drm/amd/display: Fix memory leaks in S3 resume" This reverts commit a135a1b4c4db1f3b8cbed9676a40ede39feb3362. This leads to blank screens on some boards after replugging a display. Revert until we understand the root cause and can fix both the leak and the blank screen after replug. Cc: Stylon Wang Cc: Harry Wentland Cc: Nicholas Kazlauskas Cc: Andre Tomt Cc: Oleksandr Natalenko Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 09410971615c..d2dd387c95d8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1434,8 +1434,7 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) drm_connector_update_edid_property(connector, aconnector->edid); - aconnector->num_modes = drm_add_edid_modes(connector, aconnector->edid); - drm_connector_list_update(connector); + drm_add_edid_modes(connector, aconnector->edid); if (aconnector->dc_link->aux_mode) drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux, From 4d3ba541bede68f9fc3d767396a94cfe20af6fbb Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Jan 2021 11:18:21 +0100 Subject: [PATCH 50/62] Revert "mtd: spinand: Fix OOB read" This reverts stable commit baad618d078c857f99cc286ea249e9629159901f. This commit is adding lines to spinand_write_to_cache_op, wheras the upstream commit 868cbe2a6dcee451bd8f87cbbb2a73cf463b57e5 that this was supposed to backport was touching spinand_read_from_cache_op. It causes a crash on writing OOB data by attempting to write to read-only kernel memory. Cc: Miquel Raynal Signed-off-by: Felix Fietkau Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/nand/spi/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index 671700af9180..0d21c68bfe24 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -317,10 +317,6 @@ static int spinand_write_to_cache_op(struct spinand_device *spinand, buf += ret; } - if (req->ooblen) - memcpy(req->oobbuf.in, spinand->oobbuf + req->ooboffs, - req->ooblen); - return 0; } From f2a0b7677444e12e604ed84b6cecc01ec6228b48 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 23 Jan 2020 14:03:02 +0000 Subject: [PATCH 51/62] dmaengine: at_hdmac: Substitute kzalloc with kmalloc commit a6e7f19c910068cb54983f36acebedb376f3a9ac upstream. All members of the structure are initialized below in the function, there is no need to use kzalloc. Signed-off-by: Tudor Ambarus Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/20200123140237.125799-1-tudor.ambarus@microchip.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_hdmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index ff366c2f58c1..9f2c377c8d18 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1673,7 +1673,7 @@ static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec, dma_cap_zero(mask); dma_cap_set(DMA_SLAVE, mask); - atslave = kzalloc(sizeof(*atslave), GFP_KERNEL); + atslave = kmalloc(sizeof(*atslave), GFP_KERNEL); if (!atslave) return NULL; From 20d5ee563bfdcd45cfcb9c898102e0c2f63f8b6f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 17 Aug 2020 19:57:27 +0800 Subject: [PATCH 52/62] dmaengine: at_hdmac: add missing put_device() call in at_dma_xlate() commit 3832b78b3ec2cf51e07102f9b4480e343459b20f upstream. If of_find_device_by_node() succeed, at_dma_xlate() doesn't have a corresponding put_device(). Thus add put_device() to fix the exception handling for this function implementation. Fixes: bbe89c8e3d59 ("at_hdmac: move to generic DMA binding") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20200817115728.1706719-3-yukuai3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_hdmac.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 9f2c377c8d18..80c1974ef539 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1674,8 +1674,10 @@ static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec, dma_cap_set(DMA_SLAVE, mask); atslave = kmalloc(sizeof(*atslave), GFP_KERNEL); - if (!atslave) + if (!atslave) { + put_device(&dmac_pdev->dev); return NULL; + } atslave->cfg = ATC_DST_H2SEL_HW | ATC_SRC_H2SEL_HW; /* @@ -1704,8 +1706,10 @@ static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec, atslave->dma_dev = &dmac_pdev->dev; chan = dma_request_channel(mask, at_dma_filter, atslave); - if (!chan) + if (!chan) { + put_device(&dmac_pdev->dev); return NULL; + } atchan = to_at_dma_chan(chan); atchan->per_if = dma_spec->args[0] & 0xff; From 61a0d8e437bbb2e496acfff7f356cebf92c5bf1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 17 Aug 2020 19:57:28 +0800 Subject: [PATCH 53/62] dmaengine: at_hdmac: add missing kfree() call in at_dma_xlate() commit e097eb7473d9e70da9e03276f61cd392ccb9d79f upstream. If memory allocation for 'atslave' succeed, at_dma_xlate() doesn't have a corresponding kfree() in exception handling. Thus add kfree() for this function implementation. Fixes: bbe89c8e3d59 ("at_hdmac: move to generic DMA binding") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20200817115728.1706719-4-yukuai3@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/at_hdmac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 80c1974ef539..303bc3e601a1 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1708,6 +1708,7 @@ static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec, chan = dma_request_channel(mask, at_dma_filter, atslave); if (!chan) { put_device(&dmac_pdev->dev); + kfree(atslave); return NULL; } From 7a736f41013e8fa53629ac04f3528ef897e36e95 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 29 Dec 2020 15:14:55 -0800 Subject: [PATCH 54/62] kdev_t: always inline major/minor helper functions commit aa8c7db494d0a83ecae583aa193f1134ef25d506 upstream. Silly GCC doesn't always inline these trivial functions. Fixes the following warning: arch/x86/kernel/sys_ia32.o: warning: objtool: cp_stat64()+0xd8: call to new_encode_dev() with UACCESS enabled Link: https://lkml.kernel.org/r/984353b44a4484d86ba9f73884b7306232e25e30.1608737428.git.jpoimboe@redhat.com Signed-off-by: Josh Poimboeuf Reported-by: Randy Dunlap Acked-by: Randy Dunlap [build-tested] Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/kdev_t.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 85b5151911cf..4856706fbfeb 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -21,61 +21,61 @@ }) /* acceptable for old filesystems */ -static inline bool old_valid_dev(dev_t dev) +static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } -static inline u16 old_encode_dev(dev_t dev) +static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } -static inline dev_t old_decode_dev(u16 val) +static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } -static inline u32 new_encode_dev(dev_t dev) +static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } -static inline dev_t new_decode_dev(u32 dev) +static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } -static inline u64 huge_encode_dev(dev_t dev) +static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } -static inline dev_t huge_decode_dev(u64 dev) +static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } -static inline int sysv_valid_dev(dev_t dev) +static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } -static inline u32 sysv_encode_dev(dev_t dev) +static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } -static inline unsigned sysv_major(u32 dev) +static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } -static inline unsigned sysv_minor(u32 dev) +static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } From 06c672dd61b50ab589c33a3e65e183fa16df9394 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 20 Sep 2020 12:27:39 +0100 Subject: [PATCH 55/62] iio:imu:bmi160: Fix alignment and data leak issues commit 7b6b51234df6cd8b04fe736b0b89c25612d896b8 upstream One of a class of bugs pointed out by Lars in a recent review. iio_push_to_buffers_with_timestamp assumes the buffer used is aligned to the size of the timestamp (8 bytes). This is not guaranteed in this driver which uses an array of smaller elements on the stack. As Lars also noted this anti pattern can involve a leak of data to userspace and that indeed can happen here. We close both issues by moving to a suitable array in the iio_priv() data with alignment explicitly requested. This data is allocated with kzalloc() so no data can leak apart from previous readings. In this driver, depending on which channels are enabled, the timestamp can be in a number of locations. Hence we cannot use a structure to specify the data layout without it being misleading. Fixes: 77c4ad2d6a9b ("iio: imu: Add initial support for Bosch BMI160") Reported-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Reviewed-by: Alexandru Ardelean Cc: Daniel Baluta Cc: Daniel Baluta Cc: Link: https://lore.kernel.org/r/20200920112742.170751-6-jic23@kernel.org [sudip: adjust context] Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/bmi160/bmi160.h | 7 +++++++ drivers/iio/imu/bmi160/bmi160_core.c | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/iio/imu/bmi160/bmi160.h b/drivers/iio/imu/bmi160/bmi160.h index 621f5309d735..431f10c2b951 100644 --- a/drivers/iio/imu/bmi160/bmi160.h +++ b/drivers/iio/imu/bmi160/bmi160.h @@ -7,6 +7,13 @@ struct bmi160_data { struct regmap *regmap; struct iio_trigger *trig; + /* + * Ensure natural alignment for timestamp if present. + * Max length needed: 2 * 3 channels + 4 bytes padding + 8 byte ts. + * If fewer channels are enabled, less space may be needed, as + * long as the timestamp is still aligned to 8 bytes. + */ + __le16 buf[12] __aligned(8); }; extern const struct regmap_config bmi160_regmap_config; diff --git a/drivers/iio/imu/bmi160/bmi160_core.c b/drivers/iio/imu/bmi160/bmi160_core.c index a5994899e396..088694c82327 100644 --- a/drivers/iio/imu/bmi160/bmi160_core.c +++ b/drivers/iio/imu/bmi160/bmi160_core.c @@ -411,8 +411,6 @@ static irqreturn_t bmi160_trigger_handler(int irq, void *p) struct iio_poll_func *pf = p; struct iio_dev *indio_dev = pf->indio_dev; struct bmi160_data *data = iio_priv(indio_dev); - __le16 buf[12]; - /* 2 sens x 3 axis x __le16 + 2 x __le16 pad + 4 x __le16 tstamp */ int i, ret, j = 0, base = BMI160_REG_DATA_MAGN_XOUT_L; __le16 sample; @@ -422,10 +420,10 @@ static irqreturn_t bmi160_trigger_handler(int irq, void *p) &sample, sizeof(sample)); if (ret) goto done; - buf[j++] = sample; + data->buf[j++] = sample; } - iio_push_to_buffers_with_timestamp(indio_dev, buf, pf->timestamp); + iio_push_to_buffers_with_timestamp(indio_dev, data->buf, pf->timestamp); done: iio_trigger_notify_done(indio_dev->trig); return IRQ_HANDLED; From 732251cabeb3bfd917d453a42274d769d6883fc4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Dec 2020 15:33:14 +0100 Subject: [PATCH 56/62] fuse: fix bad inode [ Upstream commit 5d069dbe8aaf2a197142558b6fb2978189ba3454 ] Jan Kara's analysis of the syzbot report (edited): The reproducer opens a directory on FUSE filesystem, it then attaches dnotify mark to the open directory. After that a fuse_do_getattr() call finds that attributes returned by the server are inconsistent, and calls make_bad_inode() which, among other things does: inode->i_mode = S_IFREG; This then confuses dnotify which doesn't tear down its structures properly and eventually crashes. Avoid calling make_bad_inode() on a live inode: switch to a private flag on the fuse inode. Also add the test to ops which the bad_inode_ops would have caught. This bug goes back to the initial merge of fuse in 2.6.14... Reported-by: syzbot+f427adf9324b92652ccc@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi Tested-by: Jan Kara Cc: Signed-off-by: Sasha Levin --- fs/fuse/acl.c | 6 ++++++ fs/fuse/dir.c | 37 ++++++++++++++++++++++++++++++++----- fs/fuse/file.c | 19 +++++++++++-------- fs/fuse/fuse_i.h | 12 ++++++++++++ fs/fuse/inode.c | 4 ++-- fs/fuse/readdir.c | 4 ++-- fs/fuse/xattr.c | 9 +++++++++ 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 5a48cee6d7d3..f529075a2ce8 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) void *value = NULL; struct posix_acl *acl; + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + if (!fc->posix_acl || fc->no_getxattr) return NULL; @@ -53,6 +56,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) const char *name; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fc->posix_acl || fc->no_setxattr) return -EOPNOTSUPP; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ee190119f45c..60378f3baaae 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -201,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) int ret; inode = d_inode_rcu(entry); - if (inode && is_bad_inode(inode)) + if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & LOOKUP_REVAL)) { @@ -386,6 +386,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, bool outarg_valid = true; bool locked; + if (fuse_is_bad(dir)) + return ERR_PTR(-EIO); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -529,6 +532,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; + if (fuse_is_bad(dir)) + return -EIO; + if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) @@ -577,6 +583,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, int err; struct fuse_forget_link *forget; + if (fuse_is_bad(dir)) + return -EIO; + forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; @@ -704,6 +713,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -740,6 +752,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -818,6 +833,9 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct fuse_conn *fc = get_fuse_conn(olddir); int err; + if (fuse_is_bad(olddir)) + return -EIO; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; @@ -953,7 +971,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + fuse_make_bad(inode); err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, @@ -1155,6 +1173,9 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) return -EACCES; @@ -1250,7 +1271,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out_err; if (fc->cache_symlinks) @@ -1298,7 +1319,7 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (fc->no_fsyncdir) @@ -1575,7 +1596,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + fuse_make_bad(inode); err = -EIO; goto error; } @@ -1631,6 +1652,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1689,6 +1713,9 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) return -EACCES; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ab4fc1255aca..1e1aef1bc20b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -222,6 +222,9 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fc->atomic_o_trunc && fc->writeback_cache; + if (fuse_is_bad(inode)) + return -EIO; + err = generic_file_open(inode, file); if (err) return err; @@ -443,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) FUSE_ARGS(args); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (fc->no_flush) @@ -506,7 +509,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; inode_lock(inode); @@ -830,7 +833,7 @@ static int fuse_readpage(struct file *file, struct page *page) int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; err = fuse_do_readpage(file, page); @@ -973,7 +976,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; data.file = file; @@ -1569,7 +1572,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - if (is_bad_inode(file_inode(file))) + if (fuse_is_bad(file_inode(file))) return -EIO; if (!(ff->open_flags & FOPEN_DIRECT_IO)) @@ -1583,7 +1586,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - if (is_bad_inode(file_inode(file))) + if (fuse_is_bad(file_inode(file))) return -EIO; if (!(ff->open_flags & FOPEN_DIRECT_IO)) @@ -2133,7 +2136,7 @@ static int fuse_writepages(struct address_space *mapping, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; data.inode = inode; @@ -2911,7 +2914,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, if (!fuse_allow_current_process(fc)) return -EACCES; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; return fuse_do_ioctl(file, cmd, arg, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d7cde216fc87..e3688312e9f1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -158,6 +158,8 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, + /* Bad inode */ + FUSE_I_BAD, }; struct fuse_conn; @@ -787,6 +789,16 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline void fuse_make_bad(struct inode *inode) +{ + set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); +} + +static inline bool fuse_is_bad(struct inode *inode) +{ + return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f58ab84b09fb..aa1d5cf1bc3a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -115,7 +115,7 @@ static void fuse_evict_inode(struct inode *inode) fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); fi->forget = NULL; } - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { + if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -306,7 +306,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { /* Inode has changed type, any I/O on the old should fail */ - make_bad_inode(inode); + fuse_make_bad(inode); iput(inode); goto retry; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 6a40f75a0d25..70f685b61e3a 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -207,7 +207,7 @@ retry: dput(dentry); goto retry; } - if (is_bad_inode(inode)) { + if (fuse_is_bad(inode)) { dput(dentry); return -EIO; } @@ -568,7 +568,7 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; mutex_lock(&ff->readdir.lock); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 20d052e08b3b..28fed5295770 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -113,6 +113,9 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) return -EACCES; @@ -178,6 +181,9 @@ static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { + if (fuse_is_bad(inode)) + return -EIO; + return fuse_getxattr(inode, name, value, size); } @@ -186,6 +192,9 @@ static int fuse_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + if (fuse_is_bad(inode)) + return -EIO; + if (!value) return fuse_removexattr(inode, name); From 71b8355ba667028557a88253096d6991a11abfba Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Fri, 28 Aug 2020 14:37:20 +0200 Subject: [PATCH 57/62] perf: Break deadlock involving exec_update_mutex [ Upstream commit 78af4dc949daaa37b3fcd5f348f373085b4e858f ] Syzbot reported a lock inversion involving perf. The sore point being perf holding exec_update_mutex() for a very long time, specifically across a whole bunch of filesystem ops in pmu::event_init() (uprobes) and anon_inode_getfile(). This then inverts against procfs code trying to take exec_update_mutex. Move the permission checks later, such that we need to hold the mutex over less code. Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sasha Levin --- kernel/events/core.c | 46 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f7c2da99299..18dbdf248ed8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11001,24 +11001,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); - if (err) - goto err_task; - - /* - * Reuse ptrace permission checks for now. - * - * We must hold exec_update_mutex across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto err_cred; - } - if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -11026,7 +11008,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cred; + goto err_task; } if (is_sampling_event(event)) { @@ -11145,6 +11127,24 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } + if (task) { + err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + if (err) + goto err_file; + + /* + * Preserve ptrace permission check for backwards compatibility. + * + * We must hold exec_update_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (move_group) { gctx = __perf_event_ctx_lock_double(group_leader, ctx); @@ -11320,7 +11320,10 @@ err_locked: if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); -/* err_file: */ +err_cred: + if (task) + mutex_unlock(&task->signal->exec_update_mutex); +err_file: fput(event_file); err_context: perf_unpin_context(ctx); @@ -11332,9 +11335,6 @@ err_alloc: */ if (!event_file) free_event(event); -err_cred: - if (task) - mutex_unlock(&task->signal->exec_update_mutex); err_task: if (task) put_task_struct(task); From 1b75a263fbd95f7e03af23921eb3f32086db66f1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:10:32 -0600 Subject: [PATCH 58/62] rwsem: Implement down_read_killable_nested [ Upstream commit 0f9368b5bf6db0c04afc5454b1be79022a681615 ] In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_killable_nested. This is needed so that kcmp_lock can be converted from working on a mutexes to working on rw_semaphores. Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87o8jabqh3.fsf@x220.int.ebiederm.org Signed-off-by: Sasha Levin --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 00d6054687dd..c91ac00d1ff8 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -173,6 +173,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); @@ -193,6 +194,7 @@ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index baafa1dd9fcc..2ce13f958577 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1608,6 +1608,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); From d390fc97df62dd76770eeab53f78e8ce2a07113d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:11:13 -0600 Subject: [PATCH 59/62] rwsem: Implement down_read_interruptible [ Upstream commit 31784cff7ee073b34d6eddabb95e3be2880a425c ] In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_interruptible. This is needed for perf_event_open to be converted (with no semantic changes) from working on a mutex to wroking on a rwsem. Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87k0tybqfy.fsf@x220.int.ebiederm.org Signed-off-by: Sasha Levin --- include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index c91ac00d1ff8..8a3606372abc 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -125,6 +125,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) * lock for reading */ extern void down_read(struct rw_semaphore *sem); +extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ce13f958577..a5eb87f2c581 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1348,6 +1348,18 @@ inline void __down_read(struct rw_semaphore *sem) } } +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + static inline int __down_read_killable(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { @@ -1498,6 +1510,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); From 117433236ae296d9770442960ddf57459177e90e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:12:00 -0600 Subject: [PATCH 60/62] exec: Transform exec_update_mutex into a rw_semaphore [ Upstream commit f7cfd871ae0c5008d94b6f66834e7845caa93c15 ] Recently syzbot reported[0] that there is a deadlock amongst the users of exec_update_mutex. The problematic lock ordering found by lockdep was: perf_event_open (exec_update_mutex -> ovl_i_mutex) chown (ovl_i_mutex -> sb_writes) sendfile (sb_writes -> p->lock) by reading from a proc file and writing to overlayfs proc_pid_syscall (p->lock -> exec_update_mutex) While looking at possible solutions it occured to me that all of the users and possible users involved only wanted to state of the given process to remain the same. They are all readers. The only writer is exec. There is no reason for readers to block on each other. So fix this deadlock by transforming exec_update_mutex into a rw_semaphore named exec_update_lock that only exec takes for writing. Cc: Jann Horn Cc: Vasiliy Kulikov Cc: Al Viro Cc: Bernd Edlinger Cc: Oleg Nesterov Cc: Christopher Yeoh Cc: Cyrill Gorcunov Cc: Sargun Dhillon Cc: Christian Brauner Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Fixes: eea9673250db ("exec: Add exec_update_mutex to replace cred_guard_mutex") [0] https://lkml.kernel.org/r/00000000000063640c05ade8e3de@google.com Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/87ft4mbqen.fsf@x220.int.ebiederm.org Signed-off-by: Eric W. Biederman Signed-off-by: Sasha Levin --- fs/exec.c | 12 ++++++------ fs/proc/base.c | 10 +++++----- include/linux/sched/signal.h | 11 ++++++----- init/init_task.c | 2 +- kernel/events/core.c | 12 ++++++------ kernel/fork.c | 6 +++--- kernel/kcmp.c | 30 +++++++++++++++--------------- kernel/locking/rwsem.c | 4 ++-- 8 files changed, 44 insertions(+), 43 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 2441eb1a1e2d..1b4d2206d53a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1009,8 +1009,8 @@ EXPORT_SYMBOL(read_code); /* * Maps the mm_struct mm into the current task struct. - * On success, this function returns with the mutex - * exec_update_mutex locked. + * On success, this function returns with exec_update_lock + * held for writing. */ static int exec_mmap(struct mm_struct *mm) { @@ -1023,7 +1023,7 @@ static int exec_mmap(struct mm_struct *mm) old_mm = current->mm; exec_mm_release(tsk, old_mm); - ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; @@ -1038,7 +1038,7 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); - mutex_unlock(&tsk->signal->exec_update_mutex); + up_write(&tsk->signal->exec_update_lock); return -EINTR; } } @@ -1450,7 +1450,7 @@ static void free_bprm(struct linux_binprm *bprm) free_arg_pages(bprm); if (bprm->cred) { if (bprm->called_exec_mmap) - mutex_unlock(¤t->signal->exec_update_mutex); + up_write(¤t->signal->exec_update_lock); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1500,7 +1500,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); - mutex_unlock(¤t->signal->exec_update_mutex); + up_write(¤t->signal->exec_update_lock); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); diff --git a/fs/proc/base.c b/fs/proc/base.c index b690074e65ff..653c2d8aa1cd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,11 +403,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->exec_update_mutex); + int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; @@ -415,7 +415,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE @@ -2769,7 +2769,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->exec_update_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2805,7 +2805,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return result; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a29df79540ce..baf58f4cb057 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,12 +226,13 @@ struct signal_struct { * credential calculations * (notably. ptrace) * Deprecated do not use in new code. - * Use exec_update_mutex instead. - */ - struct mutex exec_update_mutex; /* Held while task_struct is being - * updated during exec, and may have - * inconsistent permissions. + * Use exec_update_lock instead. */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. + */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index bd403ed3e418..df7041be96fc 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,7 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), - .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/events/core.c b/kernel/events/core.c index 18dbdf248ed8..2ef33e9a7591 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1254,7 +1254,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * exec_update_mutex + * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -11128,14 +11128,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_file; /* * Preserve ptrace permission check for backwards compatibility. * - * We must hold exec_update_mutex across this and any potential + * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -11298,7 +11298,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); put_task_struct(task); } @@ -11322,7 +11322,7 @@ err_locked: mutex_unlock(&ctx->mutex); err_cred: if (task) - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); err_file: fput(event_file); err_context: @@ -11639,7 +11639,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with exec_update_mutex held when called from + * Can be called with exec_update_lock held when called from * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) diff --git a/kernel/fork.c b/kernel/fork.c index 419fff8eb9e5..50f37d5afb32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1221,7 +1221,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->exec_update_mutex); + err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); @@ -1231,7 +1231,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return mm; } @@ -1586,7 +1586,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); - mutex_init(&sig->exec_update_mutex); + init_rwsem(&sig->exec_update_lock); return 0; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b3ff9288c6cc..c0d2ad9b4705 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -75,25 +75,25 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) return file; } -static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { - if (likely(m2 != m1)) - mutex_unlock(m2); - mutex_unlock(m1); + if (likely(l2 != l1)) + up_read(l2); + up_read(l1); } -static int kcmp_lock(struct mutex *m1, struct mutex *m2) +static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; - if (m2 > m1) - swap(m1, m2); + if (l2 > l1) + swap(l1, l2); - err = mutex_lock_killable(m1); - if (!err && likely(m1 != m2)) { - err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + err = down_read_killable(l1); + if (!err && likely(l1 != l2)) { + err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) - mutex_unlock(m1); + up_read(l1); } return err; @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. */ - ret = kcmp_lock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + ret = kcmp_lock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + kcmp_unlock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a5eb87f2c581..5d54ff3179b8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1516,7 +1516,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { - rwsem_release(&sem->dep_map, _RET_IP_); + rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } @@ -1640,7 +1640,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { - rwsem_release(&sem->dep_map, _RET_IP_); + rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } From 0a49aaf4df2936bca119ee38fe5a570a7024efdc Mon Sep 17 00:00:00 2001 From: Zhang Xiaohui Date: Sun, 6 Dec 2020 16:48:01 +0800 Subject: [PATCH 61/62] mwifiex: Fix possible buffer overflows in mwifiex_cmd_802_11_ad_hoc_start [ Upstream commit 5c455c5ab332773464d02ba17015acdca198f03d ] mwifiex_cmd_802_11_ad_hoc_start() calls memcpy() without checking the destination size may trigger a buffer overflower, which a local user could use to cause denial of service or the execution of arbitrary code. Fix it by putting the length check before calling memcpy(). Signed-off-by: Zhang Xiaohui Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201206084801.26479-1-ruc_zhangxiaohui@163.com Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/mwifiex/join.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/join.c b/drivers/net/wireless/marvell/mwifiex/join.c index d87aeff70cef..c2cb1e711c06 100644 --- a/drivers/net/wireless/marvell/mwifiex/join.c +++ b/drivers/net/wireless/marvell/mwifiex/join.c @@ -877,6 +877,8 @@ mwifiex_cmd_802_11_ad_hoc_start(struct mwifiex_private *priv, memset(adhoc_start->ssid, 0, IEEE80211_MAX_SSID_LEN); + if (req_ssid->ssid_len > IEEE80211_MAX_SSID_LEN) + req_ssid->ssid_len = IEEE80211_MAX_SSID_LEN; memcpy(adhoc_start->ssid, req_ssid->ssid, req_ssid->ssid_len); mwifiex_dbg(adapter, INFO, "info: ADHOC_S_CMD: SSID = %s\n", From f3a4c8d501452b8c2e04c4500c317ce4bdb1b47c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 9 Jan 2021 13:44:55 +0100 Subject: [PATCH 62/62] Linux 5.4.88 Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20210107143049.929352526@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 71968b4bb313..450ebe152806 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 4 -SUBLEVEL = 87 +SUBLEVEL = 88 EXTRAVERSION = NAME = Kleptomaniac Octopus