From 5417169026c3df151adf5a65eb061278b0a72e69 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 19 Jul 2007 17:39:55 +1000 Subject: [PATCH 1/4] [FS] Implement block_page_mkwrite. Many filesystems need a ->page-mkwrite callout to correctly set up pages that have been written to by mmap. This is especially important when mmap is writing into holes as it allows filesystems to correctly account for and allocate space before the mmap write is allowed to proceed. Protection against truncate races is provided by locking the page and checking to see whether the page mapping is correct and whether it is beyond EOF so we don't end up allowing allocations beyond the current EOF or changing EOF as a result of a mmap write. SGI-PV: 940392 SGI-Modid: 2.6.x-xfs-melb:linux:29146a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/buffer.c | 47 +++++++++++++++++++++++++++++++++++++ include/linux/buffer_head.h | 2 ++ 2 files changed, 49 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 0f9006714230..02ebb1f1d3b0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2194,6 +2194,52 @@ int generic_commit_write(struct file *file, struct page *page, return 0; } +/* + * block_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int +block_page_mkwrite(struct vm_area_struct *vma, struct page *page, + get_block_t get_block) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + unsigned long end; + loff_t size; + int ret = -EINVAL; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + ((page->index << PAGE_CACHE_SHIFT) > size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + end = size & ~PAGE_CACHE_MASK; + else + end = PAGE_CACHE_SIZE; + + ret = block_prepare_write(page, 0, end, get_block); + if (!ret) + ret = block_commit_write(page, 0, end); + +out_unlock: + unlock_page(page); + return ret; +} /* * nobh_prepare_write()'s prereads are special: the buffer_heads are freed @@ -2977,6 +3023,7 @@ EXPORT_SYMBOL(__brelse); EXPORT_SYMBOL(__wait_on_buffer); EXPORT_SYMBOL(block_commit_write); EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(block_page_mkwrite); EXPORT_SYMBOL(block_read_full_page); EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5c6e12853a9b..35cadad84b14 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -209,6 +209,8 @@ int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, int generic_cont_expand(struct inode *inode, loff_t size); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); +int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, + get_block_t get_block); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); From 4f57dbc6b5bae5a3978d429f45ac597ca7a3b8c6 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 19 Jul 2007 16:28:17 +1000 Subject: [PATCH 2/4] [XFS] Implement ->page_mkwrite in XFS. Hook XFS up to ->page_mkwrite to ensure that we know about mmap pages being written to. This allows use to do correct delayed allocation and ENOSPC checking as well as remap unwritten extents so that they get converted correctly during writeback. This is done via the generic block_page_mkwrite code. SGI-PV: 940392 SGI-Modid: xfs-linux-melb:xfs-kern:29149a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_file.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index cbcd40c8c2a0..b4c936485d11 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -413,6 +413,20 @@ xfs_file_open_exec( } #endif /* HAVE_FOP_OPEN_EXEC */ +/* + * mmap()d file has taken write protection fault and is being made + * writable. We can set the page state up correctly for a writable + * page, which means we can do correct delalloc accounting (ENOSPC + * checking!) and unwritten extent mapping. + */ +STATIC int +xfs_vm_page_mkwrite( + struct vm_area_struct *vma, + struct page *page) +{ + return block_page_mkwrite(vma, page, xfs_get_blocks); +} + const struct file_operations xfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -466,12 +480,14 @@ const struct file_operations xfs_dir_file_operations = { static struct vm_operations_struct xfs_file_vm_ops = { .nopage = filemap_nopage, .populate = filemap_populate, + .page_mkwrite = xfs_vm_page_mkwrite, }; #ifdef CONFIG_XFS_DMAPI static struct vm_operations_struct xfs_dmapi_file_vm_ops = { .nopage = xfs_vm_nopage, .populate = filemap_populate, + .page_mkwrite = xfs_vm_page_mkwrite, #ifdef HAVE_VMOP_MPROTECT .mprotect = xfs_vm_mprotect, #endif From 91ebecc74eeeeea0a2aa50bf1964ec2214a229c9 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 19 Jul 2007 16:28:30 +1000 Subject: [PATCH 3/4] [XFS] Allow punching holes to free space when at ENOSPC Make the free file space transaction able to dip into the reserved blocks to ensure that we can successfully free blocks when the filesystem is at ENOSPC. SGI-PV: 967788 SGI-Modid: xfs-linux-melb:xfs-kern:29167a Signed-off-by: David Chinner Signed-off-by: Vlad Apostolov Signed-off-by: Tim Shimmin --- fs/xfs/xfs_vnodeops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 79b522779aa4..401cb00a55d6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -4434,9 +4434,12 @@ xfs_free_file_space( while (!error && !done) { /* - * allocate and setup the transaction + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), From c32676eea19ce29cb74dba0f97b085e83f6b8915 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 19 Jul 2007 16:28:58 +1000 Subject: [PATCH 4/4] [XFS] Fix inode size update before data write in xfs_setattr When changing the file size by a truncate() call, we log the change in the inode size. However, we do not flush any outstanding data that might not have been written to disk, thereby violating the data/inode size update order. This can leave files full of NULLs on crash. Hence if we are truncating the file, flush any unwritten data that may lie between the curret on disk inode size and the new inode size that is being logged to ensure that ordering is preserved. SGI-PV: 966308 SGI-Modid: xfs-linux-melb:xfs-kern:29174a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Tim Shimmin --- fs/xfs/xfs_vnodeops.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 401cb00a55d6..1a5ad8cd97b0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -589,7 +589,30 @@ xfs_setattr( code = xfs_igrow_start(ip, vap->va_size, credp); } xfs_iunlock(ip, XFS_ILOCK_EXCL); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ + + /* + * We are going to log the inode size change in this + * transaction so any previous writes that are beyond the on + * disk EOF and the new EOF that have not been written out need + * to be written here. If we do not write the data out, we + * expose ourselves to the null files problem. + * + * Only flush from the on disk size to the smaller of the in + * memory file size or the new size as that's the range we + * really care about here and prevents waiting for other data + * not within the range we care about here. + */ + if (!code && + (ip->i_size != ip->i_d.di_size) && + (vap->va_size > ip->i_d.di_size)) { + code = bhv_vop_flush_pages(XFS_ITOV(ip), + ip->i_d.di_size, vap->va_size, + XFS_B_ASYNC, FI_NONE); + } + + /* wait for all I/O to complete */ + vn_iowait(vp); + if (!code) code = xfs_itruncate_data(ip, vap->va_size); if (code) {