remarkable-linux/fs/read_write.c
Andi Kleen 9465efc9e9 Remove BKL from remote_llseek v2
- Replace remote_llseek with generic_file_llseek_unlocked (to force compilation
failures in all users)
- Change all users to either use generic_file_llseek_unlocked directly or
take the BKL around. I changed the file systems who don't use the BKL
for anything (CIFS, GFS) to call it directly. NCPFS and SMBFS and NFS
take the BKL, but explicitely in their own source now.

I moved them all over in a single patch to avoid unbisectable sections.

Open problem: 32bit kernels can corrupt fpos because its modification
is not atomic, but they can do that anyways because there's other paths who
modify it without BKL.

Do we need a special lock for the pos/f_version = 0 checks?

Trond says the NFS BKL is likely not needed, but keep it for now
until his full audit.

v2: Use generic_file_llseek_unlocked instead of remote_llseek_unlocked
    and factor duplicated code (suggested by hch)

Cc: Trond.Myklebust@netapp.com
Cc: swhiteho@redhat.com
Cc: sfrench@samba.org
Cc: vandrove@vc.cvut.cz

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
2008-07-02 15:06:27 -06:00

815 lines
17 KiB
C

/*
* linux/fs/read_write.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/smp_lock.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include "read_write.h"
#include <asm/uaccess.h>
#include <asm/unistd.h>
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
};
EXPORT_SYMBOL(generic_ro_fops);
loff_t
generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
switch (origin) {
case SEEK_END:
offset += inode->i_size;
break;
case SEEK_CUR:
offset += file->f_pos;
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
/* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
return retval;
}
EXPORT_SYMBOL(generic_file_llseek_unlocked);
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
loff_t n;
mutex_lock(&file->f_dentry->d_inode->i_mutex);
n = generic_file_llseek_unlocked(file, offset, origin);
mutex_unlock(&file->f_dentry->d_inode->i_mutex);
return n;
}
EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
loff_t default_llseek(struct file *file, loff_t offset, int origin)
{
loff_t retval;
lock_kernel();
switch (origin) {
case SEEK_END:
offset += i_size_read(file->f_path.dentry->d_inode);
break;
case SEEK_CUR:
offset += file->f_pos;
}
retval = -EINVAL;
if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
unlock_kernel();
return retval;
}
EXPORT_SYMBOL(default_llseek);
loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
{
loff_t (*fn)(struct file *, loff_t, int);
fn = no_llseek;
if (file->f_mode & FMODE_LSEEK) {
fn = default_llseek;
if (file->f_op && file->f_op->llseek)
fn = file->f_op->llseek;
}
return fn(file, offset, origin);
}
EXPORT_SYMBOL(vfs_llseek);
asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
{
off_t retval;
struct file * file;
int fput_needed;
retval = -EBADF;
file = fget_light(fd, &fput_needed);
if (!file)
goto bad;
retval = -EINVAL;
if (origin <= SEEK_MAX) {
loff_t res = vfs_llseek(file, offset, origin);
retval = res;
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
fput_light(file, fput_needed);
bad:
return retval;
}
#ifdef __ARCH_WANT_SYS_LLSEEK
asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
unsigned long offset_low, loff_t __user * result,
unsigned int origin)
{
int retval;
struct file * file;
loff_t offset;
int fput_needed;
retval = -EBADF;
file = fget_light(fd, &fput_needed);
if (!file)
goto bad;
retval = -EINVAL;
if (origin > SEEK_MAX)
goto out_putf;
offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
origin);
retval = (int)offset;
if (offset >= 0) {
retval = -EFAULT;
if (!copy_to_user(result, &offset, sizeof(offset)))
retval = 0;
}
out_putf:
fput_light(file, fput_needed);
bad:
return retval;
}
#endif
/*
* rw_verify_area doesn't like huge counts. We limit
* them to something that fits in "int" so that others
* won't have to do range checks all the time.
*/
#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
{
struct inode *inode;
loff_t pos;
int retval = -EINVAL;
inode = file->f_path.dentry->d_inode;
if (unlikely((ssize_t) count < 0))
return retval;
pos = *ppos;
if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
return retval;
if (unlikely(inode->i_flock && mandatory_lock(inode))) {
retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
if (retval < 0)
return retval;
}
retval = security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
if (retval)
return retval;
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
{
set_current_state(TASK_UNINTERRUPTIBLE);
if (!kiocbIsKicked(iocb))
schedule();
else
kiocbClearKicked(iocb);
__set_current_state(TASK_RUNNING);
}
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
for (;;) {
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
EXPORT_SYMBOL(do_sync_read);
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
count = ret;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file->f_path.dentry);
add_rchar(current, ret);
}
inc_syscr(current);
}
return ret;
}
EXPORT_SYMBOL(vfs_read);
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
for (;;) {
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
EXPORT_SYMBOL(do_sync_write);
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file->f_path.dentry);
add_wchar(current, ret);
}
inc_syscw(current);
}
return ret;
}
EXPORT_SYMBOL(vfs_write);
static inline loff_t file_pos_read(struct file *file)
{
return file->f_pos;
}
static inline void file_pos_write(struct file *file, loff_t pos)
{
file->f_pos = pos;
}
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_read(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_write(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
size_t count, loff_t pos)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
if (pos < 0)
return -EINVAL;
file = fget_light(fd, &fput_needed);
if (file) {
ret = -ESPIPE;
if (file->f_mode & FMODE_PREAD)
ret = vfs_read(file, buf, count, &pos);
fput_light(file, fput_needed);
}
return ret;
}
asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
if (pos < 0)
return -EINVAL;
file = fget_light(fd, &fput_needed);
if (file) {
ret = -ESPIPE;
if (file->f_mode & FMODE_PWRITE)
ret = vfs_write(file, buf, count, &pos);
fput_light(file, fput_needed);
}
return ret;
}
/*
* Reduce an iovec's length in-place. Return the resulting number of segments
*/
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
unsigned long seg = 0;
size_t len = 0;
while (seg < nr_segs) {
seg++;
if (len + iov->iov_len >= to) {
iov->iov_len = to - len;
break;
}
len += iov->iov_len;
iov++;
}
return seg;
}
EXPORT_SYMBOL(iov_shorten);
ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
for (;;) {
ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
/* Do it by hand, with file-ops */
ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
{
struct iovec *vector = iov;
ssize_t ret = 0;
while (nr_segs > 0) {
void __user *base;
size_t len;
ssize_t nr;
base = vector->iov_base;
len = vector->iov_len;
vector++;
nr_segs--;
nr = fn(filp, base, len, ppos);
if (nr < 0) {
if (!ret)
ret = nr;
break;
}
ret += nr;
if (nr != len)
break;
}
return ret;
}
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
struct iovec **ret_pointer)
{
unsigned long seg;
ssize_t ret;
struct iovec *iov = fast_pointer;
/*
* SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has
* traditionally returned zero for zero segments, so...
*/
if (nr_segs == 0) {
ret = 0;
goto out;
}
/*
* First get the "struct iovec" from user memory and
* verify all the pointers
*/
if (nr_segs > UIO_MAXIOV) {
ret = -EINVAL;
goto out;
}
if (nr_segs > fast_segs) {
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
if (iov == NULL) {
ret = -ENOMEM;
goto out;
}
}
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
ret = -EFAULT;
goto out;
}
/*
* According to the Single Unix Specification we should return EINVAL
* if an element length is < 0 when cast to ssize_t or if the
* total length would overflow the ssize_t return value of the
* system call.
*/
ret = 0;
for (seg = 0; seg < nr_segs; seg++) {
void __user *buf = iov[seg].iov_base;
ssize_t len = (ssize_t)iov[seg].iov_len;
/* see if we we're about to use an invalid len or if
* it's about to overflow ssize_t */
if (len < 0 || (ret + len < ret)) {
ret = -EINVAL;
goto out;
}
if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
ret = -EFAULT;
goto out;
}
ret += len;
}
out:
*ret_pointer = iov;
return ret;
}
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
unsigned long nr_segs, loff_t *pos)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
if (!file->f_op) {
ret = -EINVAL;
goto out;
}
ret = rw_copy_check_uvector(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
if (ret <= 0)
goto out;
tot_len = ret;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->aio_read;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
}
if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
else
ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
out:
if (iov != iovstack)
kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file->f_path.dentry);
else
fsnotify_modify(file->f_path.dentry);
}
return ret;
}
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos)
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
return -EINVAL;
return do_readv_writev(READ, file, vec, vlen, pos);
}
EXPORT_SYMBOL(vfs_readv);
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos)
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
return -EINVAL;
return do_readv_writev(WRITE, file, vec, vlen, pos);
}
EXPORT_SYMBOL(vfs_writev);
asmlinkage ssize_t
sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_readv(file, vec, vlen, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
return ret;
}
asmlinkage ssize_t
sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_writev(file, vec, vlen, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
return ret;
}
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
struct file * in_file, * out_file;
struct inode * in_inode, * out_inode;
loff_t pos;
ssize_t retval;
int fput_needed_in, fput_needed_out, fl;
/*
* Get input file, and verify that it is ok..
*/
retval = -EBADF;
in_file = fget_light(in_fd, &fput_needed_in);
if (!in_file)
goto out;
if (!(in_file->f_mode & FMODE_READ))
goto fput_in;
retval = -EINVAL;
in_inode = in_file->f_path.dentry->d_inode;
if (!in_inode)
goto fput_in;
if (!in_file->f_op || !in_file->f_op->splice_read)
goto fput_in;
retval = -ESPIPE;
if (!ppos)
ppos = &in_file->f_pos;
else
if (!(in_file->f_mode & FMODE_PREAD))
goto fput_in;
retval = rw_verify_area(READ, in_file, ppos, count);
if (retval < 0)
goto fput_in;
count = retval;
/*
* Get output file, and verify that it is ok..
*/
retval = -EBADF;
out_file = fget_light(out_fd, &fput_needed_out);
if (!out_file)
goto fput_in;
if (!(out_file->f_mode & FMODE_WRITE))
goto fput_out;
retval = -EINVAL;
if (!out_file->f_op || !out_file->f_op->sendpage)
goto fput_out;
out_inode = out_file->f_path.dentry->d_inode;
retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
if (retval < 0)
goto fput_out;
count = retval;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
pos = *ppos;
retval = -EINVAL;
if (unlikely(pos < 0))
goto fput_out;
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
if (pos >= max)
goto fput_out;
count = max - pos;
}
fl = 0;
#if 0
/*
* We need to debate whether we can enable this or not. The
* man page documents EAGAIN return for the output at least,
* and the application is arguably buggy if it doesn't expect
* EAGAIN on a non-blocking file descriptor.
*/
if (in_file->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
retval = do_splice_direct(in_file, ppos, out_file, count, fl);
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
}
inc_syscr(current);
inc_syscw(current);
if (*ppos > max)
retval = -EOVERFLOW;
fput_out:
fput_light(out_file, fput_needed_out);
fput_in:
fput_light(in_file, fput_needed_in);
out:
return retval;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
{
loff_t pos;
off_t off;
ssize_t ret;
if (offset) {
if (unlikely(get_user(off, offset)))
return -EFAULT;
pos = off;
ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
if (unlikely(put_user(pos, offset)))
return -EFAULT;
return ret;
}
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
{
loff_t pos;
ssize_t ret;
if (offset) {
if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
return -EFAULT;
ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
if (unlikely(put_user(pos, offset)))
return -EFAULT;
return ret;
}
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}