From c5cd8273c5dee3a5f943f4acd379ce281898d6fa Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Mon, 16 Feb 2015 15:58:31 -0800 Subject: [PATCH 01/37] rtc: isl12022: deprecate use of isl in compatible string for isil MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "isil" and "isl" prefixes are used at various locations inside the kernel to reference Intersil corporation. This patch is part of a series fixing those locations were "isl" is used in compatible strings to use the now expected "isil" prefix instead (NASDAQ symbol for Intersil and most used version). The old compatible string is kept for backward compatibility. Signed-off-by: Arnaud Ebalard Cc: Rob Herring Cc: Pawel Moll Cc: Mark Rutland Cc: Ian Campbell Cc: Kumar Gala Cc: Russell King Cc: Stephen Warren Cc: Thierry Reding Cc: Alexandre Courbot Cc: Uwe Kleine-Knig Cc: Alessandro Zummo Cc: Peter Huewe Cc: Linus Walleij Cc: Mark Brown Cc: Arnd Bergmann Cc: Darshana Padmadas Cc: Grant Likely Cc: Rob Landley Cc: Jason Cooper Cc: Guenter Roeck Cc: Jason Gunthorpe Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-isl12022.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-isl12022.c b/drivers/rtc/rtc-isl12022.c index ee3ba7e6b45e..f9b082784b90 100644 --- a/drivers/rtc/rtc-isl12022.c +++ b/drivers/rtc/rtc-isl12022.c @@ -275,7 +275,8 @@ static int isl12022_probe(struct i2c_client *client, #ifdef CONFIG_OF static const struct of_device_id isl12022_dt_match[] = { - { .compatible = "isl,isl12022" }, + { .compatible = "isl,isl12022" }, /* for backward compat., don't use */ + { .compatible = "isil,isl12022" }, { }, }; #endif From 401b3c6adba0de6cd9b5cbf98ec17c0342c1c4be Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Mon, 16 Feb 2015 15:58:35 -0800 Subject: [PATCH 02/37] rtc: isl12057: deprecate use of isl in compatible string for isil MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "isil" and "isl" prefixes are used at various locations inside the kernel to reference Intersil corporation. This patch is part of a series fixing those locations were "isl" is used in compatible strings to use the now expected "isil" prefix instead (NASDAQ symbol for Intersil and most used version). The old compatible string is kept for backward compatibility. Signed-off-by: Arnaud Ebalard Cc: Rob Herring Cc: Pawel Moll Cc: Mark Rutland Cc: Ian Campbell Cc: Kumar Gala Cc: Russell King Cc: Stephen Warren Cc: Thierry Reding Cc: Alexandre Courbot Cc: Uwe Kleine-Knig Cc: Alessandro Zummo Cc: Peter Huewe Cc: Linus Walleij Cc: Mark Brown Cc: Arnd Bergmann Cc: Darshana Padmadas Cc: Grant Likely Cc: Rob Landley Cc: Jason Cooper Cc: Guenter Roeck Cc: Jason Gunthorpe Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-isl12057.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c index b8f862953f7f..da818d3337ce 100644 --- a/drivers/rtc/rtc-isl12057.c +++ b/drivers/rtc/rtc-isl12057.c @@ -644,7 +644,8 @@ static SIMPLE_DEV_PM_OPS(isl12057_rtc_pm_ops, isl12057_rtc_suspend, #ifdef CONFIG_OF static const struct of_device_id isl12057_dt_match[] = { - { .compatible = "isl,isl12057" }, + { .compatible = "isl,isl12057" }, /* for backward compat., don't use */ + { .compatible = "isil,isl12057" }, { }, }; #endif From 37e5157d6a5bed74453deb1ee21708a9e9f1eb95 Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Mon, 16 Feb 2015 15:58:39 -0800 Subject: [PATCH 03/37] staging: iio: isl29028: deprecate use of isl in compatible string for isil MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "isil" and "isl" prefixes are used at various locations inside the kernel to reference Intersil corporation. This patch is part of a series fixing those locations were "isl" is used in compatible strings to use the now expected "isil" prefix instead (NASDAQ symbol for Intersil and most used version). The old compatible string is kept for backward compatibility. Signed-off-by: Arnaud Ebalard Cc: Rob Herring Cc: Pawel Moll Cc: Mark Rutland Cc: Ian Campbell Cc: Kumar Gala Cc: Russell King Cc: Stephen Warren Cc: Thierry Reding Cc: Alexandre Courbot Cc: Uwe Kleine-Knig Cc: Alessandro Zummo Cc: Peter Huewe Cc: Linus Walleij Cc: Mark Brown Cc: Arnd Bergmann Cc: Darshana Padmadas Cc: Grant Likely Cc: Rob Landley Cc: Jason Cooper Cc: Guenter Roeck Cc: Jason Gunthorpe Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/iio/light/isl29028.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/iio/light/isl29028.c b/drivers/staging/iio/light/isl29028.c index e969107ddb47..6440e3b293ca 100644 --- a/drivers/staging/iio/light/isl29028.c +++ b/drivers/staging/iio/light/isl29028.c @@ -537,8 +537,8 @@ static const struct i2c_device_id isl29028_id[] = { MODULE_DEVICE_TABLE(i2c, isl29028_id); static const struct of_device_id isl29028_of_match[] = { - { .compatible = "isl,isl29028", }, - { .compatible = "isil,isl29028", },/* deprecated, don't use */ + { .compatible = "isl,isl29028", }, /* for backward compat., don't use */ + { .compatible = "isil,isl29028", }, { }, }; MODULE_DEVICE_TABLE(of, isl29028_of_match); From b4770fe5270ed4c67a53d5a9791aba63f8141e2a Mon Sep 17 00:00:00 2001 From: Arnaud Ebalard Date: Mon, 16 Feb 2015 15:58:43 -0800 Subject: [PATCH 04/37] arm: dts: zynq: update isl9305 compatible string to use isil vendor prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "isil" and "isl" prefixes are used at various locations inside the kernel to reference Intersil corporation. This patch is part of a series fixing those locations were "isl" is used in compatible strings to use the now expected "isil" prefix instead (NASDAQ symbol for Intersil and most used version). Note: isl9305 is an I2C device so the patch does not in fact currently depend on the introduction of "isil"-based compatible string in isl9305 driver (provided by another patch) because I2C core does not check the prefix yet. Signed-off-by: Arnaud Ebalard Cc: Rob Herring Cc: Pawel Moll Cc: Mark Rutland Cc: Ian Campbell Cc: Kumar Gala Cc: Russell King Cc: Stephen Warren Cc: Thierry Reding Cc: Alexandre Courbot Cc: Uwe Kleine-Knig Cc: Alessandro Zummo Cc: Peter Huewe Cc: Linus Walleij Cc: Mark Brown Cc: Arnd Bergmann Cc: Darshana Padmadas Cc: Grant Likely Cc: Rob Landley Cc: Jason Cooper Cc: Guenter Roeck Cc: Jason Gunthorpe Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/boot/dts/zynq-parallella.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/zynq-parallella.dts b/arch/arm/boot/dts/zynq-parallella.dts index ab1dc0a56cdd..174571232ea5 100644 --- a/arch/arm/boot/dts/zynq-parallella.dts +++ b/arch/arm/boot/dts/zynq-parallella.dts @@ -58,7 +58,7 @@ status = "okay"; isl9305: isl9305@68 { - compatible = "isl,isl9305"; + compatible = "isil,isl9305"; reg = <0x68>; regulators { From 283307c7607de2a06d3bfae4cfbf5a566d457090 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:46 -0800 Subject: [PATCH 05/37] mm: fix XIP fault vs truncate race DAX is a replacement for the variation of XIP currently supported by the ext2 filesystem. We have three different things in the tree called 'XIP', and the new focus is on access to data rather than executables, so a name change was in order. DAX stands for Direct Access. The X is for eXciting. The new focus on data access has resulted in more careful attention to races that exist in the current XIP code, but are not hit by the use-case that it was designed for. XIP's architecture worked fine for ext2, but DAX is architected to work with modern filsystems such as ext4 and XFS. DAX is not intended for use with btrfs; the value that btrfs adds relies on manipulating data and writing data to different locations, while DAX's value is for write-in-place and keeping the kernel from touching the data. DAX was developed in order to support NV-DIMMs, but it's become clear that its usefuless extends beyond NV-DIMMs and there are several potential customers including the tracing machinery. Other people want to place the kernel log in an area of memory, as long as they have a BIOS that does not clear DRAM on reboot. Patch 1 is a bug fix, probably worth including in 3.18. Patches 2 & 3 are infrastructure for DAX. Patches 4-8 replace the XIP code with its DAX equivalents, transforming ext2 to use the DAX code as we go. Note that patch 10 is the Documentation patch. Patches 9-15 clean up after the XIP code, removing the infrastructure that is no longer needed and renaming various XIP things to DAX. Most of these patches were added after Jan found things he didn't like in an earlier version of the ext4 patch ... that had been copied from ext2. So ext2 i being transformed to do things the same way that ext4 will later. The ability to mount ext2 filesystems with the 'xip' option is retained, although the 'dax' option is now preferred. Patch 16 adds some DAX infrastructure to support ext4. Patch 17 adds DAX support to ext4. It is broadly similar to ext2's DAX support, but it is more efficient than ext4's due to its support for unwritten extents. Patch 18 is another cleanup patch renaming XIP to DAX. My thanks to Mathieu Desnoyers for his reviews of the v11 patchset. Most of the changes below were based on his feedback. This patch (of 18): Pagecache faults recheck i_size after taking the page lock to ensure that the fault didn't race against a truncate. We don't have a page to lock in the XIP case, so use i_mmap_lock_read() instead. It is locked in the truncate path in unmap_mapping_range() after updating i_size. So while we hold it in the fault path, we are guaranteed that either i_size has already been updated in the truncate path, or that the truncate will subsequently call zap_page_range_single() and so remove the mapping we have just inserted. There is a window of time in which i_size has been reduced and the thread has a mapping to a page which will be removed from the file, but this is harmless as the page will not be allocated to a different purpose before the thread's access to it is revoked. [akpm@linux-foundation.org: switch to i_mmap_lock_read(), add comment in unmap_single_vma()] Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Acked-by: Kirill A. Shutemov Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 30 ++++++++++++++++++++++++++++-- mm/memory.c | 1 + 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index c175f9f25210..59e1c5585748 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -256,8 +256,20 @@ again: __xip_unmap(mapping, vmf->pgoff); found: + /* + * We must recheck i_size under i_mmap_rwsem to prevent races + * with truncation + */ + i_mmap_lock_read(mapping); + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + i_mmap_unlock_read(mapping); + return VM_FAULT_SIGBUS; + } err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, xip_pfn); + i_mmap_unlock_read(mapping); if (err == -ENOMEM) return VM_FAULT_OOM; /* @@ -281,16 +293,30 @@ found: } if (error != -ENODATA) goto out; + + /* + * We must recheck i_size under i_mmap_rwsem to prevent races + * with truncation + */ + i_mmap_lock_read(mapping); + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + ret = VM_FAULT_SIGBUS; + goto unlock; + } /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - goto out; + goto unlock; err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); if (err == -ENOMEM) - goto out; + goto unlock; ret = VM_FAULT_NOPAGE; +unlock: + i_mmap_unlock_read(mapping); out: write_seqcount_end(&xip_sparse_seq); mutex_unlock(&xip_sparse_mutex); diff --git a/mm/memory.c b/mm/memory.c index 99275325f303..1b04e13b9993 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2329,6 +2329,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; + /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); From 2e4cdab0584fa884e0a81c4f45b93ce875c9fcaa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:50 -0800 Subject: [PATCH 06/37] mm: allow page fault handlers to perform the COW Currently COW of an XIP file is done by first bringing in a read-only mapping, then retrying the fault and copying the page. It is much more efficient to tell the fault handler that a COW is being attempted (by passing in the pre-allocated page in the vm_fault structure), and allow the handler to perform the COW operation itself. The handler cannot insert the page itself if there is already a read-only mapping at that address, so allow the handler to return VM_FAULT_LOCKED and set the fault_page to be NULL. This indicates to the MM code that the i_mmap_lock is held instead of the page lock. Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bee7ec0c31f..47a93928b90f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -224,6 +224,7 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ void __user *virtual_address; /* Faulting virtual address */ + struct page *cow_page; /* Handler may choose to COW */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by diff --git a/mm/memory.c b/mm/memory.c index 1b04e13b9993..8068893697bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1965,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = page; + vmf.cow_page = NULL; ret = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2639,7 +2640,8 @@ oom: * See filemap_fault() and __lock_page_retry(). */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, struct page **page) + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) { struct vm_fault vmf; int ret; @@ -2648,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + if (!vmf.page) + goto out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2665,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + out: *page = vmf.page; return ret; } @@ -2835,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2875,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - copy_user_highpage(new_page, fault_page, address, vma); + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - page_cache_release(fault_page); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg); @@ -2913,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; From fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:53 -0800 Subject: [PATCH 07/37] vfs,ext2: introduce IS_DAX(inode) Use an inode flag to tag inodes which should avoid using the page cache. Convert ext2 to use it instead of mapping_is_xip(). Prevent I/Os to files tagged with the DAX flag from falling back to buffered I/O. Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/inode.c | 9 ++++++--- fs/ext2/xip.h | 2 -- include/linux/fs.h | 6 ++++++ mm/filemap.c | 19 ++++++++++++------- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..0cb04486577d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -731,7 +731,7 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* * we need to clear the block */ @@ -1201,7 +1201,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) + if (IS_DAX(inode)) error = xip_truncate_page(inode->i_mapping, newsize); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, @@ -1273,7 +1273,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1285,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, XIP)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index 18b34d2f31b3..29be73781419 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -16,9 +16,7 @@ static inline int ext2_use_xip (struct super_block *sb) } int ext2_get_xip_mem(struct address_space *, pgoff_t, int, void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) #else -#define mapping_is_xip(map) 0 #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 #define ext2_clear_xip_target(inode, chain) 0 diff --git a/include/linux/fs.h b/include/linux/fs.h index e49f10cc8a73..fb373bb5cf03 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1677,6 +1677,11 @@ struct super_operations { #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ #define S_NOSEC 4096 /* no suid or xattr security attributes */ +#ifdef CONFIG_FS_XIP +#define S_DAX 8192 /* Direct Access, avoiding the page cache */ +#else +#define S_DAX 0 /* Make all the DAX code disappear */ +#endif /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1714,6 +1719,7 @@ struct super_operations { #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) diff --git a/mm/filemap.c b/mm/filemap.c index d9f5336552d7..1578c224285e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1723,9 +1723,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2587,13 +2589,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + /* + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). + */ + if (written < 0 || written == count || IS_DAX(inode)) goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ pos += written; count -= written; From d475c6346a38aef3058eba96867bfa726a3cc940 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:56 -0800 Subject: [PATCH 08/37] dax,ext2: replace XIP read and write with DAX I/O Use the generic AIO infrastructure instead of custom read and write methods. In addition to giving us support for AIO, this adds the missing locking between read() and truncate(). Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 ++ fs/Makefile | 1 + fs/dax.c | 186 +++++++++++++++++++++++++++++++++++ fs/ext2/file.c | 6 +- fs/ext2/inode.c | 8 +- include/linux/fs.h | 12 ++- mm/filemap.c | 6 +- mm/filemap_xip.c | 234 --------------------------------------------- 8 files changed, 214 insertions(+), 245 deletions(-) create mode 100644 fs/dax.c diff --git a/MAINTAINERS b/MAINTAINERS index 348f5c16ef50..8670c224c833 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3151,6 +3151,12 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c +DIRECT ACCESS (DAX) +M: Matthew Wilcox +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/dax.c + DIRECTORY NOTIFICATION (DNOTIFY) M: Eric Paris S: Maintained diff --git a/fs/Makefile b/fs/Makefile index bedff48e8fdc..0534444e257c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_XIP) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..1a2bdbfa3ea9 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,186 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox + * Author: Ross Zwisler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..a247123fd798 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_EXT2_FS_XIP const struct file_operations ext2_xip_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0cb04486577d..3ccd5fd47d66 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_aops_xip = { .bmap = ext2_bmap, .get_xip_mem = ext2_get_xip_mem, + .direct_IO = ext2_direct_IO, }; const struct address_space_operations ext2_nobh_aops = { diff --git a/include/linux/fs.h b/include/linux/fs.h index fb373bb5cf03..241c3c030fb5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset, extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); +ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, + loff_t, get_block_t, dio_iodone_t, int flags); + #ifdef CONFIG_FS_XIP -extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, - loff_t *ppos); extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); -extern ssize_t xip_file_write(struct file *filp, const char __user *buf, - size_t len, loff_t *ppos); extern int xip_truncate_page(struct address_space *mapping, loff_t from); #else static inline int xip_truncate_page(struct address_space *mapping, loff_t from) @@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); +static inline bool io_is_direct(struct file *filp) +{ + return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/mm/filemap.c b/mm/filemap.c index 1578c224285e..ad7242043bdb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (file->f_flags & O_DIRECT) { + if (io_is_direct(file)) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { + if (io_is_direct(file)) { loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 59e1c5585748..9c869f402c07 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -42,119 +42,6 @@ static struct page *xip_sparse_page(void) return __xip_sparse_page; } -/* - * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_mem() function for the actual low-level - * stuff. - * - * Note the struct file* is not used at all. It may be NULL. - */ -static ssize_t -do_xip_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - char __user *buf, - size_t len, - loff_t *ppos) -{ - struct inode *inode = mapping->host; - pgoff_t index, end_index; - unsigned long offset; - loff_t isize, pos; - size_t copied = 0, error = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - pos = *ppos; - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - do { - unsigned long nr, left; - void *xip_mem; - unsigned long xip_pfn; - int zero = 0; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - if (nr > len - copied) - nr = len - copied; - - error = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(error)) { - if (error == -ENODATA) { - /* sparse */ - zero = 1; - } else - goto out; - } - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - /* address based flush */ ; - - /* - * Ok, we have the mem, so now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - if (!zero) - left = __copy_to_user(buf+copied, xip_mem+offset, nr); - else - left = __clear_user(buf + copied, nr); - - if (left) { - error = -EFAULT; - goto out; - } - - copied += (nr - left); - offset += (nr - left); - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - } while (copied < len); - -out: - *ppos = pos + copied; - if (filp) - file_accessed(filp); - - return (copied ? copied : error); -} - -ssize_t -xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - if (!access_ok(VERIFY_WRITE, buf, len)) - return -EFAULT; - - return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - buf, len, ppos); -} -EXPORT_SYMBOL_GPL(xip_file_read); - /* * __xip_unmap is invoked from xip_unmap and xip_write * @@ -341,127 +228,6 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) } EXPORT_SYMBOL_GPL(xip_file_mmap); -static ssize_t -__xip_file_write(struct file *filp, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) -{ - struct address_space * mapping = filp->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - size_t bytes; - ssize_t written = 0; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - do { - unsigned long index; - unsigned long offset; - size_t copied; - void *xip_mem; - unsigned long xip_pfn; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (status == -ENODATA) { - /* we allocate a new page unmap it */ - mutex_lock(&xip_sparse_mutex); - status = a_ops->get_xip_mem(mapping, index, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (!status) - /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, index); - } - - if (status) - break; - - copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes); - - if (likely(copied > 0)) { - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; - if (status < 0) - break; - } while (count); - *ppos = pos; - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - - return written ? written : status; -} - -ssize_t -xip_file_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - size_t count; - loff_t pos; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - - if (!access_ok(VERIFY_READ, buf, len)) { - ret=-EFAULT; - goto out_up; - } - - pos = *ppos; - count = len; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); - if (ret) - goto out_backing; - if (count == 0) - goto out_backing; - - ret = file_remove_suid(filp); - if (ret) - goto out_backing; - - ret = file_update_time(filp); - if (ret) - goto out_backing; - - ret = __xip_file_write (filp, buf, count, pos, ppos); - - out_backing: - current->backing_dev_info = NULL; - out_up: - mutex_unlock(&inode->i_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(xip_file_write); - /* * truncate a page used for execute in place * functionality is analog to block_truncate_page but does use get_xip_mem From 289c6aedac981533331428bc933fff21ae332c9e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:58:59 -0800 Subject: [PATCH 09/37] dax,ext2: replace ext2_clear_xip_target with dax_clear_blocks This is practically generic code; other filesystems will want to call it from other places, but there's nothing ext2-specific about it. Make it a little more generic by allowing it to take a count of the number of bytes to zero rather than fixing it to a single page. Thanks to Dave Hansen for suggesting that I need to call cond_resched() if zeroing more than one page. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 37 +++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 8 +++++--- fs/ext2/xip.c | 14 -------------- fs/ext2/xip.h | 3 --- include/linux/fs.h | 1 + 5 files changed, 43 insertions(+), 20 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 1a2bdbfa3ea9..69c3126a05b4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -20,8 +20,45 @@ #include #include #include +#include #include +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) { unsigned long pfn; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 3ccd5fd47d66..52978b853226 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -733,10 +733,12 @@ static int ext2_get_blocks(struct inode *inode, if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index bbc5fec6ff7f..8cfca3a4cd58 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -42,20 +42,6 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, return rc; } -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - long size; - - size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); - if (size < 0) - return size; - clear_page(kaddr); - return 0; -} - void ext2_xip_verify_sb(struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index 29be73781419..b2592f2f3c9d 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -7,8 +7,6 @@ #ifdef CONFIG_EXT2_FS_XIP extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - static inline int ext2_use_xip (struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); @@ -19,6 +17,5 @@ int ext2_get_xip_mem(struct address_space *, pgoff_t, int, #else #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 #define ext2_get_xip_mem NULL #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 241c3c030fb5..8084934a5676 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); +int dax_clear_blocks(struct inode *, sector_t block, long size); #ifdef CONFIG_FS_XIP extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); From f7ca90b160307d63aaedab8bd451c24a182db20f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:02 -0800 Subject: [PATCH 10/37] dax,ext2: replace the XIP page fault handler with the DAX page fault handler Instead of calling aops->get_xip_mem from the fault handler, the filesystem passes a get_block_t that is used to find the appropriate blocks. This requires that all architectures implement copy_user_page(). At the time of writing, mips and arm do not. Patches exist and are in progress. [akpm@linux-foundation.org: remap_file_pages went away] Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Cc: Russell King Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 241 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/file.c | 34 ++++++- include/linux/fs.h | 4 +- mm/filemap_xip.c | 206 -------------------------------------- 4 files changed, 276 insertions(+), 209 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 69c3126a05b4..553e55b93495 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -19,9 +19,13 @@ #include #include #include +#include +#include +#include #include #include #include +#include int dax_clear_blocks(struct inode *inode, sector_t block, long size) { @@ -221,3 +225,240 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, return retval; } EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a247123fd798..a61c93fd9dce 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_EXT2_FS_XIP +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -89,7 +119,7 @@ const struct file_operations ext2_xip_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/include/linux/fs.h b/include/linux/fs.h index 8084934a5676..6bad6d4c579b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -51,6 +51,7 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct vm_fault; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2590,9 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) #ifdef CONFIG_FS_XIP -extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); extern int xip_truncate_page(struct address_space *mapping, loff_t from); #else static inline int xip_truncate_page(struct address_space *mapping, loff_t from) diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9c869f402c07..59fb387b2238 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -22,212 +22,6 @@ #include #include -/* - * We do use our own empty page to avoid interference with other users - * of ZERO_PAGE(), such as /dev/zero - */ -static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); -static struct page *__xip_sparse_page; - -/* called under xip_sparse_mutex */ -static struct page *xip_sparse_page(void) -{ - if (!__xip_sparse_page) { - struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - - if (page) - __xip_sparse_page = page; - } - return __xip_sparse_page; -} - -/* - * __xip_unmap is invoked from xip_unmap and xip_write - * - * This function walks all vmas of the address_space and unmaps the - * __xip_sparse_page when found at pgoff. - */ -static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) -{ - struct vm_area_struct *vma; - struct page *page; - unsigned count; - int locked = 0; - - count = read_seqcount_begin(&xip_sparse_seq); - - page = __xip_sparse_page; - if (!page) - return; - -retry: - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - pte_t *pte, pteval; - spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long address = vma->vm_start + - ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl, 1); - if (pte) { - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); - dec_mm_counter(mm, MM_FILEPAGES); - BUG_ON(pte_dirty(pteval)); - pte_unmap_unlock(pte, ptl); - /* must invalidate_page _before_ freeing the page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(page); - } - } - i_mmap_unlock_read(mapping); - - if (locked) { - mutex_unlock(&xip_sparse_mutex); - } else if (read_seqcount_retry(&xip_sparse_seq, count)) { - mutex_lock(&xip_sparse_mutex); - locked = 1; - goto retry; - } -} - -/* - * xip_fault() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * This function is derived from filemap_fault, but used for execute in place - */ -static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - pgoff_t size; - void *xip_mem; - unsigned long xip_pfn; - struct page *page; - int error; - - /* XXX: are VM_FAULT_ codes OK? */ -again: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (likely(!error)) - goto found; - if (error != -ENODATA) - return VM_FAULT_OOM; - - /* sparse block */ - if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && - (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { - int err; - - /* maybe shared writable, allocate new block */ - mutex_lock(&xip_sparse_mutex); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, - &xip_mem, &xip_pfn); - mutex_unlock(&xip_sparse_mutex); - if (error) - return VM_FAULT_SIGBUS; - /* unmap sparse mappings at pgoff from all other vmas */ - __xip_unmap(mapping, vmf->pgoff); - -found: - /* - * We must recheck i_size under i_mmap_rwsem to prevent races - * with truncation - */ - i_mmap_lock_read(mapping); - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - i_mmap_unlock_read(mapping); - return VM_FAULT_SIGBUS; - } - err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - xip_pfn); - i_mmap_unlock_read(mapping); - if (err == -ENOMEM) - return VM_FAULT_OOM; - /* - * err == -EBUSY is fine, we've raced against another thread - * that faulted-in the same page - */ - if (err != -EBUSY) - BUG_ON(err); - return VM_FAULT_NOPAGE; - } else { - int err, ret = VM_FAULT_OOM; - - mutex_lock(&xip_sparse_mutex); - write_seqcount_begin(&xip_sparse_seq); - error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, - &xip_mem, &xip_pfn); - if (unlikely(!error)) { - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - goto again; - } - if (error != -ENODATA) - goto out; - - /* - * We must recheck i_size under i_mmap_rwsem to prevent races - * with truncation - */ - i_mmap_lock_read(mapping); - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - ret = VM_FAULT_SIGBUS; - goto unlock; - } - /* not shared and writable, use xip_sparse_page() */ - page = xip_sparse_page(); - if (!page) - goto unlock; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, - page); - if (err == -ENOMEM) - goto unlock; - - ret = VM_FAULT_NOPAGE; -unlock: - i_mmap_unlock_read(mapping); -out: - write_seqcount_end(&xip_sparse_seq); - mutex_unlock(&xip_sparse_mutex); - - return ret; - } -} - -static const struct vm_operations_struct xip_file_vm_ops = { - .fault = xip_file_fault, - .page_mkwrite = filemap_page_mkwrite, -}; - -int xip_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - BUG_ON(!file->f_mapping->a_ops->get_xip_mem); - - file_accessed(file); - vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; - return 0; -} -EXPORT_SYMBOL_GPL(xip_file_mmap); - /* * truncate a page used for execute in place * functionality is analog to block_truncate_page but does use get_xip_mem From 4c0ccfef2e9f7418a6eb0bf07a2fc8f216365b18 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:06 -0800 Subject: [PATCH 11/37] dax,ext2: replace xip_truncate_page with dax_truncate_page It takes a get_block parameter just like nobh_truncate_page() and block_truncate_page() Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 2 +- include/linux/fs.h | 10 +--------- mm/filemap_xip.c | 40 ---------------------------------------- 4 files changed, 46 insertions(+), 50 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 553e55b93495..ebf9b2231b0f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -462,3 +462,47 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return result; } EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating an DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmaped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length = PAGE_CACHE_ALIGN(from) - from; + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 52978b853226..5ac0a3461b3c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1210,7 +1210,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) - error = xip_truncate_page(inode->i_mapping, newsize); + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6bad6d4c579b..2c8f9055af38 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2591,18 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) -#ifdef CONFIG_FS_XIP -extern int xip_truncate_page(struct address_space *mapping, loff_t from); -#else -static inline int xip_truncate_page(struct address_space *mapping, loff_t from) -{ - return 0; -} -#endif - #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, loff_t file_offset); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 59fb387b2238..8e3f99b61959 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -22,43 +22,3 @@ #include #include -/* - * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_mem - * to get the page instead of page cache - */ -int -xip_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - unsigned length; - void *xip_mem; - unsigned long xip_pfn; - int err; - - BUG_ON(!mapping->a_ops->get_xip_mem); - - blocksize = 1 << mapping->host->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - - err = mapping->a_ops->get_xip_mem(mapping, index, 0, - &xip_mem, &xip_pfn); - if (unlikely(err)) { - if (err == -ENODATA) - /* Hole? No need to truncate */ - return 0; - else - return err; - } - memset(xip_mem + offset, 0, length); - return 0; -} -EXPORT_SYMBOL_GPL(xip_truncate_page); From 95ec8daba310b44302d2977dd54b16886527b681 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:09 -0800 Subject: [PATCH 12/37] dax: replace XIP documentation with DAX documentation Based on the original XIP documentation, this documents the current state of affairs, and includes instructions on how users can enable DAX if their devices and kernel support it. Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/00-INDEX | 5 +- Documentation/filesystems/dax.txt | 89 ++++++++++++++++++++++++++++++ Documentation/filesystems/xip.txt | 71 ------------------------ 3 files changed, 92 insertions(+), 73 deletions(-) create mode 100644 Documentation/filesystems/dax.txt delete mode 100644 Documentation/filesystems/xip.txt diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index ac28149aede4..9922939e7d99 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -34,6 +34,9 @@ configfs/ - directory containing configfs documentation and example code. cramfs.txt - info on the cram filesystem for small storage (ROMs etc). +dax.txt + - info on avoiding the page cache for files stored on CPU-addressable + storage devices. debugfs.txt - info on the debugfs filesystem. devpts.txt @@ -154,5 +157,3 @@ xfs-self-describing-metadata.txt - info on XFS Self Describing Metadata. xfs.txt - info and mount options for the XFS filesystem. -xip.txt - - info on execute-in-place for file mappings. diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt new file mode 100644 index 000000000000..635adaa1e425 --- /dev/null +++ b/Documentation/filesystems/dax.txt @@ -0,0 +1,89 @@ +Direct Access for files +----------------------- + +Motivation +---------- + +The page cache is usually used to buffer reads and writes to files. +It is also used to provide the pages which are mapped into userspace +by a call to mmap. + +For block devices that are memory-like, the page cache pages would be +unnecessary copies of the original storage. The DAX code removes the +extra copy by performing reads and writes directly to the storage device. +For file mappings, the storage device is mapped directly into userspace. + + +Usage +----- + +If you have a block device which supports DAX, you can make a filesystem +on it as usual. When mounting it, use the -o dax option manually +or add 'dax' to the options in /etc/fstab. + + +Implementation Tips for Block Driver Writers +-------------------------------------------- + +To support DAX in your block driver, implement the 'direct_access' +block device operation. It is used to translate the sector number +(expressed in units of 512-byte sectors) to a page frame number (pfn) +that identifies the physical page for the memory. It also returns a +kernel virtual address that can be used to access the memory. + +The direct_access method takes a 'size' parameter that indicates the +number of bytes being requested. The function should return the number +of bytes that can be contiguously accessed at that offset. It may also +return a negative errno if an error occurs. + +In order to support this method, the storage must be byte-accessible by +the CPU at all times. If your device uses paging techniques to expose +a large amount of memory through a smaller window, then you cannot +implement direct_access. Equally, if your device can occasionally +stall the CPU for an extended period, you should also not attempt to +implement direct_access. + +These block devices may be used for inspiration: +- axonram: Axon DDR2 device driver +- brd: RAM backed block device driver +- dcssblk: s390 dcss block device driver + + +Implementation Tips for Filesystem Writers +------------------------------------------ + +Filesystem support consists of +- adding support to mark inodes as being DAX by setting the S_DAX flag in + i_flags +- implementing the direct_IO address space operation, and calling + dax_do_io() instead of blockdev_direct_IO() if S_DAX is set +- implementing an mmap file operation for DAX files which sets the + VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers + for fault and page_mkwrite (which should probably call dax_fault() and + dax_mkwrite(), passing the appropriate get_block() callback) +- calling dax_truncate_page() instead of block_truncate_page() for DAX files +- ensuring that there is sufficient locking between reads, writes, + truncates and page faults + +The get_block() callback passed to the DAX functions may return +uninitialised extents. If it does, it must ensure that simultaneous +calls to get_block() (for example by a page-fault racing with a read() +or a write()) work correctly. + +These filesystems may be used for inspiration: +- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt + + +Shortcomings +------------ + +Even if the kernel or its modules are stored on a filesystem that supports +DAX on a block device that supports DAX, they will still be copied into RAM. + +Calling get_user_pages() on a range of user memory that has been mmaped +from a DAX file will fail as there are no 'struct page' to describe +those pages. This problem is being worked on. That means that O_DIRECT +reads/writes to those memory ranges from a non-DAX file will fail (note +that O_DIRECT reads/writes _of a DAX file_ do work, it is the memory +that is being accessed that is key here). Other things that will not +work include RDMA, sendfile() and splice(). diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt deleted file mode 100644 index b77472949ede..000000000000 --- a/Documentation/filesystems/xip.txt +++ /dev/null @@ -1,71 +0,0 @@ -Execute-in-place for file mappings ----------------------------------- - -Motivation ----------- -File mappings are performed by mapping page cache pages to userspace. In -addition, read&write type file operations also transfer data from/to the page -cache. - -For memory backed storage devices that use the block device interface, the page -cache pages are in fact copies of the original storage. Various approaches -exist to work around the need for an extra copy. The ramdisk driver for example -does read the data into the page cache, keeps a reference, and discards the -original data behind later on. - -Execute-in-place solves this issue the other way around: instead of keeping -data in the page cache, the need to have a page cache copy is eliminated -completely. With execute-in-place, read&write type operations are performed -directly from/to the memory backed storage device. For file mappings, the -storage device itself is mapped directly into userspace. - -This implementation was initially written for shared memory segments between -different virtual machines on s390 hardware to allow multiple machines to -share the same binaries and libraries. - -Implementation --------------- -Execute-in-place is implemented in three steps: block device operation, -address space operation, and file operations. - -A block device operation named direct_access is used to translate the -block device sector number to a page frame number (pfn) that identifies -the physical page for the memory. It also returns a kernel virtual -address that can be used to access the memory. - -The direct_access method takes a 'size' parameter that indicates the -number of bytes being requested. The function should return the number -of bytes that can be contiguously accessed at that offset. It may also -return a negative errno if an error occurs. - -The block device operation is optional, these block devices support it as of -today: -- dcssblk: s390 dcss block device driver - -An address space operation named get_xip_mem is used to retrieve references -to a page frame number and a kernel address. To obtain these values a reference -to an address_space is provided. This function assigns values to the kmem and -pfn parameters. The third argument indicates whether the function should allocate -blocks if needed. - -This address space operation is mutually exclusive with readpage&writepage that -do page cache read/write operations. -The following filesystems support it as of today: -- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt - -A set of file operations that do utilize get_xip_page can be found in -mm/filemap_xip.c . The following file operation implementations are provided: -- aio_read/aio_write -- readv/writev -- sendfile - -The generic file operations do_sync_read/do_sync_write can be used to implement -classic synchronous IO calls. - -Shortcomings ------------- -This implementation is limited to storage devices that are cpu addressable at -all times (no highmem or such). It works well on rom/ram, but enhancements are -needed to make it work with flash in read+write mode. -Putting the Linux kernel and/or its modules on a xip filesystem does not mean -they are not copied. From e748dcd095ddee50e7a7deda2e26247715318a2e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:12 -0800 Subject: [PATCH 13/37] vfs: remove get_xip_mem All callers of get_xip_mem() are now gone. Remove checks for it, initialisers of it, documentation of it and the only implementation of it. Also remove mm/filemap_xip.c as it is now empty. Also remove documentation of the long-gone get_xip_page(). Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 3 --- Documentation/filesystems/vfs.txt | 7 ----- fs/exofs/inode.c | 1 - fs/ext2/inode.c | 1 - fs/ext2/xip.c | 45 ------------------------------- fs/ext2/xip.h | 3 --- fs/open.c | 5 +--- include/linux/fs.h | 2 -- include/linux/rmap.h | 2 +- mm/Makefile | 1 - mm/fadvise.c | 6 +++-- mm/filemap_xip.c | 24 ----------------- mm/madvise.c | 2 +- 13 files changed, 7 insertions(+), 95 deletions(-) delete mode 100644 mm/filemap_xip.c diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b30753cbf431..2ca3d17eee56 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -199,8 +199,6 @@ prototypes: int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, - unsigned long *); int (*migratepage)(struct address_space *, struct page *, struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); @@ -225,7 +223,6 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: -get_xip_mem: maybe migratepage: yes (both) launder_page: yes is_partially_uptodate: yes diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 43ce0507ee25..966b22829f3b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -591,8 +591,6 @@ struct address_space_operations { int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - struct page* (*get_xip_page)(struct address_space *, sector_t, - int); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); @@ -748,11 +746,6 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. - get_xip_page: called by the VM to translate a block number to a page. - The page is valid until the corresponding filesystem is unmounted. - Filesystems that want to use execute-in-place (XIP) need to implement - it. An example implementation can be found in fs/ext2/xip.c. - migrate_page: This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6fc91df99ff8..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5ac0a3461b3c..59d6c7d43740 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -894,7 +894,6 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_aops_xip = { .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, .direct_IO = ext2_direct_IO, }; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 8cfca3a4cd58..132d4daf98c4 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,35 +13,6 @@ #include "ext2.h" #include "xip.h" -static inline long __inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn, long size) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block * (PAGE_SIZE / 512); - return bdev_direct_access(bdev, sector, kaddr, pfn, size); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - void ext2_xip_verify_sb(struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); @@ -54,19 +25,3 @@ void ext2_xip_verify_sb(struct super_block *sb) "not supported by bdev"); } } - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - long rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); - return (rc < 0) ? rc : 0; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index b2592f2f3c9d..e7b9f0a2cc54 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -12,10 +12,7 @@ static inline int ext2_use_xip (struct super_block *sb) struct ext2_sb_info *sbi = EXT2_SB(sb); return (sbi->s_mount_opt & EXT2_MOUNT_XIP); } -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); #else #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 -#define ext2_get_xip_mem NULL #endif diff --git a/fs/open.c b/fs/open.c index 813be037b412..a293c2020676 100644 --- a/fs/open.c +++ b/fs/open.c @@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c8f9055af38..9772d655f444 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -362,8 +362,6 @@ struct address_space_operations { int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); - int (*get_xip_mem)(struct address_space *, pgoff_t, int, - void **, unsigned long *); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b38f559130d5..c4c559a45dc8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,7 +198,7 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); /* - * Called from mm/filemap_xip.c to unmap empty zero page + * Used by uprobes to replace a userspace page safely */ pte_t *__page_check_address(struct page *, struct mm_struct *, unsigned long, spinlock_t **, int); diff --git a/mm/Makefile b/mm/Makefile index 088c68e9ec35..3c1caa2693bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,7 +55,6 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o -obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/fadvise.c b/mm/fadvise.c index fac23ecf8d72..4a3907cf79f8 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -28,6 +28,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); + struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) if (!f.file) return -EBADF; - if (S_ISFIFO(file_inode(f.file)->i_mode)) { + inode = file_inode(f.file); + if (S_ISFIFO(inode->i_mode)) { ret = -ESPIPE; goto out; } @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - if (mapping->a_ops->get_xip_mem) { + if (IS_DAX(inode)) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c deleted file mode 100644 index 8e3f99b61959..000000000000 --- a/mm/filemap_xip.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * linux/mm/filemap_xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte - * - * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - diff --git a/mm/madvise.c b/mm/madvise.c index 1077cbdc8b52..d551475517bf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -239,7 +239,7 @@ static long madvise_willneed(struct vm_area_struct *vma, return -EBADF; #endif - if (file->f_mapping->a_ops->get_xip_mem) { + if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } From 0de4830fd49f60d04ba37e8b32f95100f3953c39 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:15 -0800 Subject: [PATCH 14/37] ext2: remove ext2_xip_verify_sb() Jan Kara pointed out that calling ext2_xip_verify_sb() in ext2_remount() doesn't make sense, since changing the XIP option on remount isn't allowed. It also doesn't make sense to re-check whether blocksize is supported since it can't change between mounts. Replace the call to ext2_xip_verify_sb() in ext2_fill_super() with the equivalent check and delete the definition. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 33 ++++++++++++--------------------- fs/ext2/xip.c | 12 ------------ fs/ext2/xip.h | 2 -- 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ae55fddc26a9..83c5d51b8ede 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -877,9 +877,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -909,11 +906,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { - if (!silent) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) { + if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); - goto failed_mount; + "error: unsupported blocksize for xip"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support xip"); + goto failed_mount; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1259,7 +1262,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - unsigned long old_mount_opt = sbi->s_mount_opt; struct ext2_mount_options old_opts; unsigned long old_sb_flags; int err; @@ -1284,22 +1286,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - - if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - ext2_msg(sb, KERN_WARNING, - "warning: unsupported blocksize for xip"); - err = -EINVAL; - goto restore_opts; - } - es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_XIP) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " "xip flag with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; - sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + sbi->s_mount_opt ^= EXT2_MOUNT_XIP; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 132d4daf98c4..66ca1133827e 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,15 +13,3 @@ #include "ext2.h" #include "xip.h" -void ext2_xip_verify_sb(struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - - if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && - !sb->s_bdev->bd_disk->fops->direct_access) { - sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_msg(sb, KERN_WARNING, - "warning: ignoring xip option - " - "not supported by bdev"); - } -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index e7b9f0a2cc54..87eeb0460e8c 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -6,13 +6,11 @@ */ #ifdef CONFIG_EXT2_FS_XIP -extern void ext2_xip_verify_sb (struct super_block *); static inline int ext2_use_xip (struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); return (sbi->s_mount_opt & EXT2_MOUNT_XIP); } #else -#define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 #endif From ed87e9202035c8564472f5d84e7d5e10f1014029 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:18 -0800 Subject: [PATCH 15/37] ext2: remove ext2_use_xip Replace ext2_use_xip() with test_opt(XIP) which expands to the same code Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ext2.h | 4 ++++ fs/ext2/inode.c | 2 +- fs/ext2/namei.c | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e4279ead4a05..30604c4d70e6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,7 +380,11 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ +#ifdef CONFIG_FS_XIP #define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#else +#define EXT2_MOUNT_XIP 0 +#endif #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 59d6c7d43740..cba38331a124 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1394,7 +1394,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { + if (test_opt(inode->i_sb, XIP)) { inode->i_mapping->a_ops = &ext2_aops_xip; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c268d0af1db9..846c356af7ca 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -105,7 +105,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { + if (test_opt(inode->i_sb, XIP)) { inode->i_mapping->a_ops = &ext2_aops_xip; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { @@ -126,7 +126,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { + if (test_opt(inode->i_sb, XIP)) { inode->i_mapping->a_ops = &ext2_aops_xip; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { From 07642381d5ad800d021bb80842bfebb100b12fc2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:22 -0800 Subject: [PATCH 16/37] ext2: remove xip.c and xip.h These files are now empty, so delete them Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/Makefile | 1 - fs/ext2/inode.c | 1 - fs/ext2/namei.c | 1 - fs/ext2/super.c | 1 - fs/ext2/xip.c | 15 --------------- fs/ext2/xip.h | 16 ---------------- 6 files changed, 35 deletions(-) delete mode 100644 fs/ext2/xip.c delete mode 100644 fs/ext2/xip.h diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index f42af45cfd88..445b0e996a12 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o -ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index cba38331a124..154cbcf30f35 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,7 +34,6 @@ #include #include "ext2.h" #include "acl.h" -#include "xip.h" #include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 846c356af7ca..7ca803f408b0 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 83c5d51b8ede..50342583db1f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, int wait); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c deleted file mode 100644 index 66ca1133827e..000000000000 --- a/fs/ext2/xip.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * linux/fs/ext2/xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#include -#include -#include -#include -#include -#include "ext2.h" -#include "xip.h" - diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h deleted file mode 100644 index 87eeb0460e8c..000000000000 --- a/fs/ext2/xip.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * linux/fs/ext2/xip.h - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#ifdef CONFIG_EXT2_FS_XIP -static inline int ext2_use_xip (struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - return (sbi->s_mount_opt & EXT2_MOUNT_XIP); -} -#else -#define ext2_use_xip(sb) 0 -#endif From 6cd176a51e52e5218b1aa97e1ec916bac25a9b7e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:25 -0800 Subject: [PATCH 17/37] vfs,ext2: remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX The fewer Kconfig options we have the better. Use the generic CONFIG_FS_DAX to enable XIP support in ext2 as well as in the core. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 21 ++++++++++++++------- fs/Makefile | 2 +- fs/ext2/Kconfig | 11 ----------- fs/ext2/ext2.h | 2 +- fs/ext2/file.c | 4 ++-- fs/ext2/super.c | 4 ++-- include/linux/fs.h | 2 +- scripts/diffconfig | 1 - 8 files changed, 21 insertions(+), 26 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index a6bb530b1ec5..5331497d5b25 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,20 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines diff --git a/fs/Makefile b/fs/Makefile index 0534444e257c..0f4635f7c49c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,7 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_FS_XIP) += dax.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 30604c4d70e6..6854038c09ae 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,7 +380,7 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#ifdef CONFIG_FS_XIP +#ifdef CONFIG_FS_DAX #define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ #else #define EXT2_MOUNT_XIP 0 diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a61c93fd9dce..de8174d1e973 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,7 +25,7 @@ #include "xattr.h" #include "acl.h" -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { return dax_fault(vma, vmf, ext2_get_block); @@ -108,7 +108,7 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX const struct file_operations ext2_xip_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 50342583db1f..5f029d8c3a02 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -291,7 +291,7 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); #endif @@ -558,7 +558,7 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP +#ifdef CONFIG_FS_DAX set_opt (sbi->s_mount_opt, XIP); #else ext2_msg(sb, KERN_INFO, "xip option not supported"); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9772d655f444..d46f8fe6a0ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1676,7 +1676,7 @@ struct super_operations { #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ #define S_NOSEC 4096 /* no suid or xattr security attributes */ -#ifdef CONFIG_FS_XIP +#ifdef CONFIG_FS_DAX #define S_DAX 8192 /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ diff --git a/scripts/diffconfig b/scripts/diffconfig index 6d672836e187..0db267d0adc9 100755 --- a/scripts/diffconfig +++ b/scripts/diffconfig @@ -28,7 +28,6 @@ If no config files are specified, .config and .config.old are used. Example usage: $ diffconfig .config config-with-some-changes -EXT2_FS_XATTR n --EXT2_FS_XIP n CRAMFS n -> y EXT2_FS y -> n LOG_BUF_SHIFT 14 -> 16 From 97443aa809a142b1e6db2ccfb046c3a962907204 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:28 -0800 Subject: [PATCH 18/37] ext2: remove ext2_aops_xip We shouldn't need a special address_space_operations any more Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ext2.h | 1 - fs/ext2/inode.c | 7 +------ fs/ext2/namei.c | 4 ++-- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 6854038c09ae..ab9b3ec3bac9 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -796,7 +796,6 @@ extern const struct file_operations ext2_xip_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 154cbcf30f35..034fd42eade0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -891,11 +891,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_aops_xip = { - .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, -}; - const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -1394,7 +1389,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; if (test_opt(inode->i_sb, XIP)) { - inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_mapping->a_ops = &ext2_aops; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 7ca803f408b0..0db888c91bec 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -105,7 +105,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode inode->i_op = &ext2_file_inode_operations; if (test_opt(inode->i_sb, XIP)) { - inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_mapping->a_ops = &ext2_aops; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; @@ -126,7 +126,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &ext2_file_inode_operations; if (test_opt(inode->i_sb, XIP)) { - inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_mapping->a_ops = &ext2_aops; inode->i_fop = &ext2_xip_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; From 9c3ce9ec58716733232b97771b10f31901caf62e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:31 -0800 Subject: [PATCH 19/37] ext2: get rid of most mentions of XIP in ext2 To help people transition, accept the 'xip' mount option (and report it in /proc/mounts), but print a message encouraging people to switch over to the 'dax' option. Signed-off-by: Matthew Wilcox Reviewed-by: Mathieu Desnoyers Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ext2.txt | 5 +++-- fs/ext2/ext2.h | 13 +++++++------ fs/ext2/file.c | 2 +- fs/ext2/inode.c | 6 +++--- fs/ext2/namei.c | 8 ++++---- fs/ext2/super.c | 25 ++++++++++++++++--------- 6 files changed, 34 insertions(+), 25 deletions(-) diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt index 67639f905f10..b9714569e472 100644 --- a/Documentation/filesystems/ext2.txt +++ b/Documentation/filesystems/ext2.txt @@ -20,6 +20,9 @@ minixdf Makes `df' act like Minix. check=none, nocheck (*) Don't do extra checking of bitmaps on mount (check=normal and check=strict options removed) +dax Use direct access (no page cache). See + Documentation/filesystems/dax.txt. + debug Extra debugging information is sent to the kernel syslog. Useful for developers. @@ -56,8 +59,6 @@ noacl Don't support POSIX ACLs. nobh Do not attach buffer_heads to file pagecache. -xip Use execute in place (no caching) if possible - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index ab9b3ec3bac9..678f9ab08c48 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,14 +380,15 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#ifdef CONFIG_FS_DAX -#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ -#else -#define EXT2_MOUNT_XIP 0 -#endif +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt @@ -792,7 +793,7 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_xip_file_operations; +extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index de8174d1e973..e31701713516 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -109,7 +109,7 @@ const struct file_operations ext2_file_operations = { }; #ifdef CONFIG_FS_DAX -const struct file_operations ext2_xip_file_operations = { +const struct file_operations ext2_dax_file_operations = { .llseek = generic_file_llseek, .read = new_sync_read, .write = new_sync_write, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 034fd42eade0..6434bc000125 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1286,7 +1286,7 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; - if (test_opt(inode->i_sb, XIP)) + if (test_opt(inode->i_sb, DAX)) inode->i_flags |= S_DAX; } @@ -1388,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, XIP)) { + if (test_opt(inode->i_sb, DAX)) { inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_xip_file_operations; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 0db888c91bec..148f6e3789ea 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -104,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, XIP)) { + if (test_opt(inode->i_sb, DAX)) { inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_xip_file_operations; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; @@ -125,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, XIP)) { + if (test_opt(inode->i_sb, DAX)) { inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_xip_file_operations; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5f029d8c3a02..d0e746e96511 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -294,6 +294,8 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); #endif if (!test_opt(sb, RESERVATION)) @@ -402,7 +404,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; @@ -431,6 +433,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, + {Opt_dax, "dax"}, {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, @@ -558,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: + ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: #ifdef CONFIG_FS_DAX - set_opt (sbi->s_mount_opt, XIP); + set_opt(sbi->s_mount_opt, DAX); #else - ext2_msg(sb, KERN_INFO, "xip option not supported"); + ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif break; @@ -905,15 +912,15 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (sbi->s_mount_opt & EXT2_MOUNT_XIP) { + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); + "error: unsupported blocksize for dax"); goto failed_mount; } if (!sb->s_bdev->bd_disk->fops->direct_access) { ext2_msg(sb, KERN_ERR, - "error: device does not support xip"); + "error: device does not support dax"); goto failed_mount; } } @@ -1286,10 +1293,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " - "xip flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT2_MOUNT_XIP; + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); From 25726bc15731d42112b579cf73f30edbc43d3973 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:35 -0800 Subject: [PATCH 20/37] dax: add dax_zero_page_range This new function allows us to support hole-punch for DAX files by zeroing a partial page, as opposed to the dax_truncate_page() function which can only truncate to the end of the page. Reimplement dax_truncate_page() to call dax_zero_page_range(). [ross.zwisler@linux.intel.com: ported to 3.13-rc2] [akpm@linux-foundation.org: fix typos in comments] Signed-off-by: Matthew Wilcox Signed-off-by: Ross Zwisler Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 1 + fs/dax.c | 38 ++++++++++++++++++++++++++----- include/linux/fs.h | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 635adaa1e425..ebcd97f03654 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -62,6 +62,7 @@ Filesystem support consists of for fault and page_mkwrite (which should probably call dax_fault() and dax_mkwrite(), passing the appropriate get_block() callback) - calling dax_truncate_page() instead of block_truncate_page() for DAX files +- calling dax_zero_page_range() instead of zero_user() for DAX files - ensuring that there is sufficient locking between reads, writes, truncates and page faults diff --git a/fs/dax.c b/fs/dax.c index ebf9b2231b0f..ed1619ec6537 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -464,31 +464,35 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, EXPORT_SYMBOL_GPL(dax_fault); /** - * dax_truncate_page - handle a partial page being truncated in a DAX file + * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated * @from: The file offset that is being truncated to + * @length: The number of bytes to zero * @get_block: The filesystem method used to translate file offsets to blocks * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating an DAX file to handle the partial page. + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. * * We work in terms of PAGE_CACHE_SIZE here for commonality with * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem * took care of disposing of the unnecessary blocks. Even if the filesystem * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmaped. + * since the file might be mmapped. */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) { struct buffer_head bh; pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length = PAGE_CACHE_ALIGN(from) - from; int err; /* Block boundary? Nothing to do */ if (!length) return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); bh.b_size = PAGE_CACHE_SIZE; @@ -505,4 +509,26 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return 0; } +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/include/linux/fs.h b/include/linux/fs.h index d46f8fe6a0ea..ed5a0900b94d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2589,6 +2589,7 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) From 923ae0ff9250430133b3310fe62c47538cf1cbc1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 16 Feb 2015 15:59:38 -0800 Subject: [PATCH 21/37] ext4: add DAX functionality This is a port of the DAX functionality found in the current version of ext2. [matthew.r.wilcox@intel.com: heavily tweaked] [akpm@linux-foundation.org: remap_pages went away] Signed-off-by: Ross Zwisler Reviewed-by: Andreas Dilger Signed-off-by: Matthew Wilcox Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 1 + Documentation/filesystems/ext4.txt | 4 ++ fs/ext4/ext4.h | 6 ++ fs/ext4/file.c | 49 +++++++++++++++- fs/ext4/indirect.c | 18 ++++-- fs/ext4/inode.c | 89 +++++++++++++++++++++--------- fs/ext4/namei.c | 10 +++- fs/ext4/super.c | 39 ++++++++++++- 8 files changed, 179 insertions(+), 37 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index ebcd97f03654..be376d91d058 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -73,6 +73,7 @@ or a write()) work correctly. These filesystems may be used for inspiration: - ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt +- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt Shortcomings diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 919a3293aaa4..6c0108eb0137 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -386,6 +386,10 @@ max_dir_size_kb=n This limits the size of directories so that any i_version Enable 64-bit inode version support. This option is off by default. +dax Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that + this option is incompatible with data=journal. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a75fba67bb1f..982d934fd9ac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -965,6 +965,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7cb592386121..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,6 +191,26 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..6b9878a24182 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..28555f191b62 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -657,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3277,6 +3280,33 @@ unlock: return err; } +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. @@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4534,7 +4569,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2291923dae4e..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 64c39c7c594f..10e8c6b7ca08 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1124,7 +1124,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1187,6 +1187,7 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, @@ -1371,6 +1372,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1606,6 +1608,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } sbi->s_jquota_fmt = m->mount_opt; +#endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; #endif } else { if (!args->from) @@ -3589,6 +3596,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3652,6 +3664,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4869,6 +4894,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) From a7a97fc9ff6c2fcec00feb34d9b87b94452b0b78 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:41 -0800 Subject: [PATCH 22/37] brd: rename XIP to DAX Since this is relating to FS_XIP, not KERNEL_XIP, it should be called DAX instead of XIP. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/Kconfig | 13 +++++++------ drivers/block/brd.c | 14 +++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 014a1cfc41c5..1b8094d4d7af 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -393,14 +393,15 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config BLK_DEV_XIP - bool "Support XIP filesystems on RAM block device" - depends on BLK_DEV_RAM +config BLK_DEV_RAM_DAX + bool "Support Direct Access (DAX) to RAM block devices" + depends on BLK_DEV_RAM && FS_DAX default n help - Support XIP filesystems (such as ext2 with XIP support on) on - top of block ram device. This will slightly enlarge the kernel, and - will prevent RAM block device backing store memory from being + Support filesystems using DAX to access RAM block devices. This + avoids double-buffering data in the page cache before copying it + to the block device. Answering Y will slightly enlarge the kernel, + and will prevent RAM block device backing store memory from being allocated from highmem (only a problem for highmem systems). config CDROM_PKTCDVD diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c01b921b1b4a..64ab4951e9d6 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -97,13 +97,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) * Must use NOIO because we don't want to recurse back into the * block or filesystem layers from page reclaim. * - * Cannot support XIP and highmem, because our ->direct_access - * routine for XIP must return memory that is always addressable. - * If XIP was reworked to use pfns and kmap throughout, this + * Cannot support DAX and highmem, because our ->direct_access + * routine for DAX must return memory that is always addressable. + * If DAX was reworked to use pfns and kmap throughout, this * restriction might be able to be lifted. */ gfp_flags = GFP_NOIO | __GFP_ZERO; -#ifndef CONFIG_BLK_DEV_XIP +#ifndef CONFIG_BLK_DEV_RAM_DAX gfp_flags |= __GFP_HIGHMEM; #endif page = alloc_page(gfp_flags); @@ -369,7 +369,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, return err; } -#ifdef CONFIG_BLK_DEV_XIP +#ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, void **kaddr, unsigned long *pfn, long size) { @@ -390,6 +390,8 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, */ return PAGE_SIZE; } +#else +#define brd_direct_access NULL #endif static int brd_ioctl(struct block_device *bdev, fmode_t mode, @@ -430,9 +432,7 @@ static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, .ioctl = brd_ioctl, -#ifdef CONFIG_BLK_DEV_XIP .direct_access = brd_direct_access, -#endif }; /* From d92576f1167cacf7844e5993f343eed4a6d8a147 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 16 Feb 2015 15:59:44 -0800 Subject: [PATCH 23/37] dax: does not work correctly with virtual aliasing caches The DAX code accesses the underlying storage through the kernel's linear mapping, which may not be cache-coherent with user mappings on ARM, MIPS or SPARC. Temporarily disable the DAX code until this problem is resolved. The original XIP code also had this problem, but it was never noticed. Signed-off-by: Matthew Wilcox Cc: Andreas Dilger Cc: Boaz Harrosh Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Cc: Kirill A. Shutemov Cc: Mathieu Desnoyers Cc: Randy Dunlap Cc: Ross Zwisler Cc: Theodore Ts'o Cc: Ralf Baechle Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 3 +++ fs/Kconfig | 1 + 2 files changed, 4 insertions(+) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index be376d91d058..baf41118660d 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -82,6 +82,9 @@ Shortcomings Even if the kernel or its modules are stored on a filesystem that supports DAX on a block device that supports DAX, they will still be copied into RAM. +The DAX code does not work correctly on architectures which have virtually +mapped caches such as ARM, MIPS and SPARC. + Calling get_user_pages() on a range of user memory that has been mmaped from a DAX file will fail as there are no 'struct page' to describe those pages. This problem is being worked on. That means that O_DIRECT diff --git a/fs/Kconfig b/fs/Kconfig index 5331497d5b25..ec35851e5b71 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -36,6 +36,7 @@ source "fs/nilfs2/Kconfig" config FS_DAX bool "Direct Access (DAX) support" depends on MMU + depends on !(ARM || MIPS || SPARC) help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, From 6f9e2456c9f8904346958b6d9602a755372865b0 Mon Sep 17 00:00:00 2001 From: Akash Shende Date: Mon, 16 Feb 2015 15:59:48 -0800 Subject: [PATCH 24/37] MAINTAINERS: fix spelling mistake & remove trailing WS Signed-off-by: Akash Shende Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8670c224c833..0ca23aa48ae7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -34,7 +34,7 @@ trivial patch so apply some common sense. generalized kernel feature ready for next time. PLEASE check your patch with the automated style checker - (scripts/checkpatch.pl) to catch trival style violations. + (scripts/checkpatch.pl) to catch trivial style violations. See Documentation/CodingStyle for guidance here. PLEASE CC: the maintainers and mailing lists that are generated From 026749a86ebff68cb2accdcd29872d36ac148920 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 15:59:50 -0800 Subject: [PATCH 25/37] ocfs2: prepare some interfaces used in append direct io Currently in case of append O_DIRECT write (block not allocated yet), ocfs2 will fall back to buffered I/O. This has some disadvantages. Firstly, it is not the behavior as expected. Secondly, it will consume huge page cache, e.g. in mass backup scenario. Thirdly, modern filesystems such as ext4 support this feature. In this patch set, the direct I/O write doesn't fallback to buffer I/O write any more because the allocate blocks are enabled in direct I/O now. This patch (of 9): Prepare some interfaces which will be used in append O_DIRECT write. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Mark Fasheh Cc: Joel Becker Cc: Xuejiufei Cc: Junxiao Bi Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 11 +++++++++-- fs/ocfs2/file.h | 9 +++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e0f04d55fd05..2fce3c40ad27 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -295,7 +295,7 @@ out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -441,7 +441,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -709,6 +709,13 @@ leave: return status; } +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..e8c62f22215c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); From 06ee5c75b57564435933fc0ffa72dc18a2fda0e7 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 15:59:54 -0800 Subject: [PATCH 26/37] ocfs2: add functions to add and remove inode in orphan dir Add functions to add inode to orphan dir and remove inode in orphan dir. Here we do not call ocfs2_prepare_orphan_dir and ocfs2_orphan_add directly. Because append O_DIRECT will add inode to orphan two and may result in more than one orphan entry for the same inode. [akpm@linux-foundation.org: avoid dynamic stack allocation] Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Junxiao Bi Cc: Joel Becker Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 2 +- fs/ocfs2/journal.h | 5 + fs/ocfs2/namei.c | 247 +++++++++++++++++++++++++++++++++++++++----- fs/ocfs2/namei.h | 8 +- fs/ocfs2/ocfs2_fs.h | 6 +- 5 files changed, 238 insertions(+), 30 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c8b25de9efbb..3025c0da6b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7f8cde94abfe..f4cd3c3e9fb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 914c121ec890..7eec45d0d85f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, @@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; @@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir, if (should_add_orphan) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto bail; @@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; - ret = ocfs2_blkno_stringify(blkno, name); + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } + + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, namelen); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, namelen, orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2527,6 +2577,149 @@ leave: return status; } +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2615,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..5ddecce172fa 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..cf4fa43af78e 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -229,6 +229,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -729,7 +731,9 @@ struct ocfs2_dinode { inode belongs to. Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ From ed460cffc26ba4ec663a89589d81290ca92c5010 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 15:59:57 -0800 Subject: [PATCH 27/37] ocfs2: add orphan recovery types in ocfs2_recover_orphans Define two orphan recovery types, which indicates if need truncate file or not. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Junxiao Bi Cc: Joel Becker Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 108 ++++++++++++++++++++++++++++++++++++--------- fs/ocfs2/ocfs2.h | 5 +++ 2 files changed, 93 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d10860fde165..9730f5350ef4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -50,6 +50,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; @@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1360,7 +1374,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1419,13 +1433,14 @@ skip_recovery: continue; } ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while @@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ @@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; @@ -2132,13 +2155,58 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; - spin_lock(&oi->ip_lock); - /* Set the proper information to get us going into - * ocfs2_delete_inode. */ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ + +next: iput(inode); inode = iter; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fdbcbfed529e..df9a95cbea3a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE From 24c40b329e03dd38a1ca2225c739db67f4441343 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:00 -0800 Subject: [PATCH 28/37] ocfs2: implement ocfs2_direct_IO_write Implement ocfs2_direct_IO_write. Add the inode to orphan dir first, and then delete it once append O_DIRECT finished. This is to make sure block allocation and inode size are consistent. [akpm@linux-foundation.org: fix it for "block: Add discard flag to blkdev_issue_zeroout() function"] Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Junxiao Bi Cc: Joel Becker Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/ocfs2.h | 10 +++ 2 files changed, 204 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 46d93e941f3d..be5986b7e5c6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -47,6 +48,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -597,6 +601,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u32 zero_len; + int cluster_align; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + + zero_len = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align = !zero_len; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, offset); + else + ret = ocfs2_extend_no_holes(inode, di_bh, offset, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; + } + + written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written < 0 && append_write && !is_overwrite && + !cluster_align) { + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len >> 9, GFP_KERNEL, false); + if (ret < 0) + mlog_errno(ret); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, @@ -604,6 +786,9 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * Fallback to buffered I/O if we see an inode without @@ -612,14 +797,20 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) return 0; - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + if (rw == READ) + return __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index df9a95cbea3a..7e39cd654834 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -731,6 +731,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { From 49255dce65dba3d06a0f642fee788f0a50d48d5b Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:03 -0800 Subject: [PATCH 29/37] ocfs2: allocate blocks in ocfs2_direct_IO_get_blocks Allow blocks allocation in ocfs2_direct_IO_get_blocks. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Junxiao Bi Cc: Joel Becker Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index be5986b7e5c6..44db1808cdb5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -510,18 +510,21 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; + u32 cpos = 0; + int alloc_locked = 0; u64 p_blkno, inode_blocks, contig_blocks; unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need @@ -543,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* We should already CoW the refcounted extent in case of create. */ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -560,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, contig_blocks = max_blocks; bh_result->b_size = contig_blocks << blocksize_bits; bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); return ret; } From d943d59dd32d33cd8a44a2f9caf373ede11200da Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:06 -0800 Subject: [PATCH 30/37] ocfs2: do not fallback to buffer I/O write if appending Now we can do direct io and do not fallback to buffered IO any more in case of append O_DIRECT write. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Joel Becker Cc: Junxiao Bi Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2fce3c40ad27..1055a2ece738 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2116,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos = 0, end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2204,7 +2207,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * one node could wind up truncating another * nodes writes. */ - if (end > i_size_read(inode)) { + if (end > i_size_read(inode) && !full_coherency) { *direct_io = 0; break; } From 3a83b342c87e6d21290de8dc76ec20a67821261d Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:09 -0800 Subject: [PATCH 31/37] ocfs2: complete the rest request through buffer io Complte the rest request thourgh buffer io after direct write performed. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Joel Becker Cc: Junxiao Bi Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1055a2ece738..784f2c72c992 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2253,6 +2253,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2367,11 +2368,51 @@ relock: iov_iter_truncate(from, count); if (direct_io) { + loff_t endbyte; + ssize_t written_buffered; written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0) { + if (written < 0 || written == count) { ret = written; goto out_dio; } + + /* + * for completing the rest of the request. + */ + *ppos += written; + count -= written; + written_buffered = generic_perform_write(file, from, *ppos); + /* + * If generic_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + ret = written_buffered; + goto out_dio; + } + + iocb->ki_pos = *ppos + written_buffered; + /* We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = *ppos + written_buffered - 1; + ret = filemap_write_and_wait_range(file->f_mapping, *ppos, + endbyte); + if (ret == 0) { + written += written_buffered; + invalidate_mapping_pages(mapping, + *ppos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } } else { current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); From 4813962beef7586f890a645a1bda77691da4b74a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:12 -0800 Subject: [PATCH 32/37] ocfs2: wait for orphan recovery first once append O_DIRECT write crash If one node has crashed with orphan entry leftover, another node which do append O_DIRECT write to the same file will override the i_dio_orphaned_slot. Then the old entry won't be cleaned forever. If this case happens, we let it wait for orphan recovery first. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Joel Becker Cc: Junxiao Bi Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.h | 2 ++ fs/ocfs2/journal.c | 2 ++ fs/ocfs2/namei.c | 37 +++++++++++++++++++++++++++++++++++++ fs/ocfs2/super.c | 2 ++ 4 files changed, 43 insertions(+) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..5e86b247c821 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,6 +81,8 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; + wait_queue_head_t append_dio_wq; + struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9730f5350ef4..ff531928269e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2204,6 +2204,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); if (ret) mlog_errno(ret); + + wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ next: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 7eec45d0d85f..b5c3a5ea3ee6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2577,6 +2577,27 @@ leave: return status; } +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode) { @@ -2586,13 +2607,29 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct buffer_head *di_bh = NULL; int status = 0; handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; +restart: status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { mlog_errno(status); goto bail; } + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, OCFS2_I(inode)->ip_blkno, orphan_name, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 87a1f7679d9b..26675185b886 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); + init_waitqueue_head(&oi->append_dio_wq); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); From 160cc266639d4213c15c103074561c1b44ffe691 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 16 Feb 2015 16:00:15 -0800 Subject: [PATCH 33/37] ocfs2: set append dio as a ro compat feature Intruduce a bit OCFS2_FEATURE_RO_COMPAT_APPEND_DIO and check it in write flow. If the bit is not set, fall back to the old way. Signed-off-by: Joseph Qi Cc: Weiwei Wang Cc: Joel Becker Cc: Junxiao Bi Cc: Mark Fasheh Cc: Xuejiufei Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 17 ++++++++++++++++- fs/ocfs2/ocfs2.h | 8 ++++++++ fs/ocfs2/ocfs2_fs.h | 8 +++++++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 784f2c72c992..46e0d4e857c7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2212,6 +2212,15 @@ static int ocfs2_prepare_inode_for_write(struct file *file, break; } + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { + *direct_io = 0; + break; + } + /* * We don't fill holes during direct io, so * check for them here. If any are found, the @@ -2220,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ ret = ocfs2_check_range_for_holes(inode, saved_pos, count); if (ret == 1) { - *direct_io = 0; + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; ret = 0; } else if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7e39cd654834..8490c64d34fe 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -500,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index cf4fa43af78e..20e37a3ed26f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -105,7 +105,8 @@ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) /* * Heartbeat-only devices are missing journals and other files. The @@ -199,6 +200,11 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ From 780fc5642f59b6c6e2b05794de60b2d2ad5f040e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 16 Feb 2015 16:00:18 -0800 Subject: [PATCH 34/37] powerpc: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/pgtable-ppc32.h | 9 ++------- arch/powerpc/include/asm/pgtable-ppc64.h | 5 +---- arch/powerpc/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/pte-40x.h | 1 - arch/powerpc/include/asm/pte-44x.h | 5 ----- arch/powerpc/include/asm/pte-8xx.h | 1 - arch/powerpc/include/asm/pte-book3e.h | 1 - arch/powerpc/include/asm/pte-fsl-booke.h | 3 --- arch/powerpc/include/asm/pte-hash32.h | 1 - arch/powerpc/include/asm/pte-hash64.h | 1 - arch/powerpc/mm/pgtable_64.c | 2 +- 11 files changed, 4 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index 14bdcbd31670..64b52b1cf542 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -333,8 +333,8 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) /* * Encode and decode a swap entry. * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit, the _PAGE_FILE bit, or the - *_PAGE_HASHPTE bit (if used). -- paulus + * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used). + * -- paulus */ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) @@ -342,11 +342,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -/* Encode and decode a nonlinear file mapping entry */ -#define PTE_FILE_MAX_BITS 29 -#define pte_to_pgoff(pte) (pte_val(pte) >> 3) -#define pgoff_to_pte(off) ((pte_t) { ((off) << 3) | _PAGE_FILE }) - #ifndef CONFIG_PPC_4K_PAGES void pgtable_cache_init(void); #else diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d46532ccc386..43e6ad424c7f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -352,9 +352,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) #define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)}) #define __pte_to_swp_entry(pte) ((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT}) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << PTE_RPN_SHIFT }) -#define pte_to_pgoff(pte) (pte_val(pte) >> PTE_RPN_SHIFT) -#define pgoff_to_pte(off) ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE}) -#define PTE_FILE_MAX_BITS (BITS_PER_LONG - PTE_RPN_SHIFT) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); @@ -389,7 +386,7 @@ void pgtable_cache_init(void); * The last three bits are intentionally left to zero. This memory location * are also used as normal page PTE pointers. So if we have any pointers * left around while we collapse a hugepage, we need to make sure - * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them + * _PAGE_PRESENT bit of that is zero when we look at them */ static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) { diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 79fee2eb8d56..9835ac4173b7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -34,7 +34,6 @@ static inline int pte_write(pte_t pte) { return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } diff --git a/arch/powerpc/include/asm/pte-40x.h b/arch/powerpc/include/asm/pte-40x.h index ec0b0b0d1df9..486b1ef81338 100644 --- a/arch/powerpc/include/asm/pte-40x.h +++ b/arch/powerpc/include/asm/pte-40x.h @@ -38,7 +38,6 @@ */ #define _PAGE_GUARDED 0x001 /* G: page is guarded from prefetch */ -#define _PAGE_FILE 0x001 /* when !present: nonlinear file mapping */ #define _PAGE_PRESENT 0x002 /* software: PTE contains a translation */ #define _PAGE_NO_CACHE 0x004 /* I: caching is inhibited */ #define _PAGE_WRITETHRU 0x008 /* W: caching is write-through */ diff --git a/arch/powerpc/include/asm/pte-44x.h b/arch/powerpc/include/asm/pte-44x.h index 4192b9bad901..36f75fab23f5 100644 --- a/arch/powerpc/include/asm/pte-44x.h +++ b/arch/powerpc/include/asm/pte-44x.h @@ -44,9 +44,6 @@ * - PRESENT *must* be in the bottom three bits because swap cache * entries use the top 29 bits for TLB2. * - * - FILE *must* be in the bottom three bits because swap cache - * entries use the top 29 bits for TLB2. - * * - CACHE COHERENT bit (M) has no effect on original PPC440 cores, * because it doesn't support SMP. However, some later 460 variants * have -some- form of SMP support and so I keep the bit there for @@ -68,7 +65,6 @@ * * There are three protection bits available for SWAP entry: * _PAGE_PRESENT - * _PAGE_FILE * _PAGE_HASHPTE (if HW has) * * So those three bits have to be inside of 0-2nd LSB of PTE. @@ -77,7 +73,6 @@ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ #define _PAGE_RW 0x00000002 /* S: Write permission */ -#define _PAGE_FILE 0x00000004 /* S: nonlinear file mapping */ #define _PAGE_EXEC 0x00000004 /* H: Execute permission */ #define _PAGE_ACCESSED 0x00000008 /* S: Page referenced */ #define _PAGE_DIRTY 0x00000010 /* S: Page dirty */ diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h index eb6edb44f140..97bae64afdaa 100644 --- a/arch/powerpc/include/asm/pte-8xx.h +++ b/arch/powerpc/include/asm/pte-8xx.h @@ -29,7 +29,6 @@ /* Definitions for 8xx embedded chips. */ #define _PAGE_PRESENT 0x0001 /* Page is valid */ -#define _PAGE_FILE 0x0002 /* when !present: nonlinear file mapping */ #define _PAGE_NO_CACHE 0x0002 /* I: cache inhibit */ #define _PAGE_SHARED 0x0004 /* No ASID (context) compare */ #define _PAGE_SPECIAL 0x0008 /* SW entry, forced to 0 by the TLB miss */ diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 576ad88104cb..91a704952ca1 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -10,7 +10,6 @@ /* Architected bits */ #define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ -#define _PAGE_FILE 0x000002 /* (!present only) software: pte holds file offset */ #define _PAGE_SW1 0x000002 #define _PAGE_BAP_SR 0x000004 #define _PAGE_BAP_UR 0x000008 diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h index e84dd7ed505e..9f5c3d04a1a3 100644 --- a/arch/powerpc/include/asm/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/pte-fsl-booke.h @@ -13,14 +13,11 @@ - PRESENT *must* be in the bottom three bits because swap cache entries use the top 29 bits. - - FILE *must* be in the bottom three bits because swap cache - entries use the top 29 bits. */ /* Definitions for FSL Book-E Cores */ #define _PAGE_PRESENT 0x00001 /* S: PTE contains a translation */ #define _PAGE_USER 0x00002 /* S: User page (maps to UR) */ -#define _PAGE_FILE 0x00002 /* S: when !present: nonlinear file mapping */ #define _PAGE_RW 0x00004 /* S: Write permission (SW) */ #define _PAGE_DIRTY 0x00008 /* S: Page dirty */ #define _PAGE_EXEC 0x00010 /* H: SX permission */ diff --git a/arch/powerpc/include/asm/pte-hash32.h b/arch/powerpc/include/asm/pte-hash32.h index 4aad4132d0a8..62cfb0c663bb 100644 --- a/arch/powerpc/include/asm/pte-hash32.h +++ b/arch/powerpc/include/asm/pte-hash32.h @@ -18,7 +18,6 @@ #define _PAGE_PRESENT 0x001 /* software: pte contains a translation */ #define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */ -#define _PAGE_FILE 0x004 /* when !present: nonlinear file mapping */ #define _PAGE_USER 0x004 /* usermode access allowed */ #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 55aea0caf95e..fc852f7e7b3a 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -16,7 +16,6 @@ */ #define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */ #define _PAGE_USER 0x0002 /* matches one of the PP bits */ -#define _PAGE_FILE 0x0002 /* (!present only) software: pte holds file offset */ #define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we invert) */ #define _PAGE_GUARDED 0x0008 /* We can derive Memory coherence from _PAGE_NO_CACHE */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 91bb8836825a..6957cc1ca0a7 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -782,7 +782,7 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) { pmd_t pmd; /* - * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always + * For a valid pte, we would have _PAGE_PRESENT always * set. We use this to check THP page at pmd level. * leaf pte for huge page, bottom two bits != 00 */ From 841c009007144bed9b46643e2e0c4ba6821a2299 Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Mon, 16 Feb 2015 16:00:20 -0800 Subject: [PATCH 35/37] lib/Kconfig: use bool instead of boolean Keyword 'boolean' for type definition attributes is considered deprecated and, therefore, should not be used anymore. See http://lkml.kernel.org/r/cover.1418003065.git.cj@linux.com See http://lkml.kernel.org/r/1419108071-11607-1-git-send-email-cj@linux.com Signed-off-by: Christoph Jaeger Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index cd177caf3876..cb9758e0ba0c 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -14,7 +14,7 @@ config BITREVERSE tristate config HAVE_ARCH_BITREVERSE - boolean + bool default n depends on BITREVERSE help From befeb596a7224119b60499eb20c7545f7a73f104 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Feb 2015 16:00:23 -0800 Subject: [PATCH 36/37] MAINTAINERS: add entry for Maxim PMICs on Samsung boards Add myself and Chanwoo Choi as supporters to help in reviewing patches for Maxim 77686 PMIC and Maxim 14577/77693 MUIC drivers: - mfd (all of them), - extcon (extcon-max14577.c, extcon-max77693.c), - regulator (all of them), - clock (clk-max77686.c), - RTC (rtc-max77686.c). Lately I am the author of contributors to them. These drivers are used on Exynos-based boards (Trats 2, Gear 1 and Gear 2). Signed-off-by: Krzysztof Kozlowski Cc: MyungJoo Ham Cc: Chanwoo Choi Cc: Samuel Ortiz Cc: Lee Jones Cc: Liam Girdwood Cc: Mark Brown Cc: Mike Turquette Cc: Stephen Boyd Cc: Alessandro Zummo Acked-by: Mark Brown Acked-by: Lee Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0ca23aa48ae7..6f2c849e6da3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6216,6 +6216,26 @@ S: Supported F: drivers/power/max14577_charger.c F: drivers/power/max77693_charger.c +MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS +M: Chanwoo Choi +M: Krzysztof Kozlowski +L: linux-kernel@vger.kernel.org +S: Supported +F: drivers/*/max14577.c +F: drivers/*/max77686.c +F: drivers/*/max77693.c +F: drivers/extcon/extcon-max14577.c +F: drivers/extcon/extcon-max77693.c +F: drivers/rtc/rtc-max77686.c +F: drivers/clk/clk-max77686.c +F: Documentation/devicetree/bindings/mfd/max14577.txt +F: Documentation/devicetree/bindings/mfd/max77686.txt +F: Documentation/devicetree/bindings/mfd/max77693.txt +F: Documentation/devicetree/bindings/clock/maxim,max77686.txt +F: include/linux/mfd/max14577*.h +F: include/linux/mfd/max77686*.h +F: include/linux/mfd/max77693*.h + MAXIRADIO FM RADIO RECEIVER DRIVER M: Hans Verkuil L: linux-media@vger.kernel.org From aaaf5fbf56f16c81a653713cc333b18ad6e25ea9 Mon Sep 17 00:00:00 2001 From: Joshua Kinard Date: Mon, 16 Feb 2015 16:00:26 -0800 Subject: [PATCH 37/37] rtc: add driver for DS1685 family of real time clocks This adds a driver for the Dallas/Maxim DS1685-family of RTC chips. It supports the DS1685/DS1687, DS1688/DS1691, DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and DS17885/DS17887 RTC chips. These chips are commonly found in SGI O2 and SGI Octane systems. It was originally derived from a driver patch submitted by Matthias Fuchs many years ago for use in EPPC-405-UC modules, which also used these RTCs. In addition to the time-keeping functions, this RTC also handles the shutdown mechanism of the O2 and Octane and acts as a partial NVRAM for the boot PROMS in these systems. Verified on both an SGI O2 and an SGI Octane. Signed-off-by: Joshua Kinard Cc: Ralf Baechle Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 + drivers/rtc/Kconfig | 90 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-ds1685.c | 2252 ++++++++++++++++++++++++++++++++++++ include/linux/rtc/ds1685.h | 375 ++++++ 5 files changed, 2724 insertions(+) create mode 100644 drivers/rtc/rtc-ds1685.c create mode 100644 include/linux/rtc/ds1685.h diff --git a/MAINTAINERS b/MAINTAINERS index 6f2c849e6da3..d5ff4c3cdbac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2963,6 +2963,12 @@ S: Supported F: drivers/input/touchscreen/cyttsp* F: include/linux/input/cyttsp.h +DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK +M: Joshua Kinard +S: Maintained +F: drivers/rtc/rtc-ds1685.c +F: include/linux/rtc/ds1685.h + DAMA SLAVE for AX.25 M: Joerg Reuter W: http://yaina.de/jreuter/ diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 3bc9ddbe5cf7..0cf2e1d9cb17 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -801,6 +801,96 @@ config RTC_DRV_DS1553 This driver can also be built as a module. If so, the module will be called rtc-ds1553. +config RTC_DRV_DS1685_FAMILY + tristate "Dallas/Maxim DS1685 Family" + help + If you say yes here you get support for the Dallas/Maxim DS1685 + family of real time chips. This family includes the DS1685/DS1687, + DS1689/DS1693, DS17285/DS17287, DS17485/DS17487, and + DS17885/DS17887 chips. + + This driver can also be built as a module. If so, the module + will be called rtc-ds1685. + +choice + prompt "Subtype" + depends on RTC_DRV_DS1685_FAMILY + default RTC_DRV_DS1685 + +config RTC_DRV_DS1685 + bool "DS1685/DS1687" + help + This enables support for the Dallas/Maxim DS1685/DS1687 real time + clock chip. + + This chip is commonly found in SGI O2 (IP32) and SGI Octane (IP30) + systems, as well as EPPC-405-UC modules by electronic system design + GmbH. + +config RTC_DRV_DS1689 + bool "DS1689/DS1693" + help + This enables support for the Dallas/Maxim DS1689/DS1693 real time + clock chip. + + This is an older RTC chip, supplanted by the DS1685/DS1687 above, + which supports a few minor features such as Vcc, Vbat, and Power + Cycle counters, plus a customer-specific, 8-byte ROM/Serial number. + + It also works for the even older DS1688/DS1691 RTC chips, which are + virtually the same and carry the same model number. Both chips + have 114 bytes of user NVRAM. + +config RTC_DRV_DS17285 + bool "DS17285/DS17287" + help + This enables support for the Dallas/Maxim DS17285/DS17287 real time + clock chip. + + This chip features 2kb of extended NV-SRAM. It may possibly be + found in some SGI O2 systems (rare). + +config RTC_DRV_DS17485 + bool "DS17485/DS17487" + help + This enables support for the Dallas/Maxim DS17485/DS17487 real time + clock chip. + + This chip features 4kb of extended NV-SRAM. + +config RTC_DRV_DS17885 + bool "DS17885/DS17887" + help + This enables support for the Dallas/Maxim DS17885/DS17887 real time + clock chip. + + This chip features 8kb of extended NV-SRAM. + +endchoice + +config RTC_DS1685_PROC_REGS + bool "Display register values in /proc" + depends on RTC_DRV_DS1685_FAMILY && PROC_FS + help + Enable this to display a readout of all of the RTC registers in + /proc/drivers/rtc. Keep in mind that this can potentially lead + to lost interrupts, as reading Control Register C will clear + all pending IRQ flags. + + Unless you are debugging this driver, choose N. + +config RTC_DS1685_SYSFS_REGS + bool "SysFS access to RTC register bits" + depends on RTC_DRV_DS1685_FAMILY && SYSFS + help + Enable this to provide access to the RTC control register bits + in /sys. Some of the bits are read-write, others are read-only. + + Keep in mind that reading Control C's bits automatically clears + all pending IRQ flags - this can cause lost interrupts. + + If you know that you need access to these bits, choose Y, Else N. + config RTC_DRV_DS1742 tristate "Maxim/Dallas DS1742/1743" depends on HAS_IOMEM diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 99ded8b75e95..69c87062b098 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_RTC_DRV_DS1390) += rtc-ds1390.o obj-$(CONFIG_RTC_DRV_DS1511) += rtc-ds1511.o obj-$(CONFIG_RTC_DRV_DS1553) += rtc-ds1553.o obj-$(CONFIG_RTC_DRV_DS1672) += rtc-ds1672.o +obj-$(CONFIG_RTC_DRV_DS1685_FAMILY) += rtc-ds1685.o obj-$(CONFIG_RTC_DRV_DS1742) += rtc-ds1742.o obj-$(CONFIG_RTC_DRV_DS2404) += rtc-ds2404.o obj-$(CONFIG_RTC_DRV_DS3232) += rtc-ds3232.o diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c new file mode 100644 index 000000000000..8c3bfcb115b7 --- /dev/null +++ b/drivers/rtc/rtc-ds1685.c @@ -0,0 +1,2252 @@ +/* + * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time + * chips. + * + * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2009 Matthias Fuchs . + * + * References: + * DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10. + * DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10. + * DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105. + * Application Note 90, Using the Multiplex Bus RTC Extended Features. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_PROC_FS +#include +#endif + +#define DRV_VERSION "0.42.0" + + +/* ----------------------------------------------------------------------- */ +/* Standard read/write functions if platform does not provide overrides */ + +/** + * ds1685_read - read a value from an rtc register. + * @rtc: pointer to the ds1685 rtc structure. + * @reg: the register address to read. + */ +static u8 +ds1685_read(struct ds1685_priv *rtc, int reg) +{ + return readb((u8 __iomem *)rtc->regs + + (reg * rtc->regstep)); +} + +/** + * ds1685_write - write a value to an rtc register. + * @rtc: pointer to the ds1685 rtc structure. + * @reg: the register address to write. + * @value: value to write to the register. + */ +static void +ds1685_write(struct ds1685_priv *rtc, int reg, u8 value) +{ + writeb(value, ((u8 __iomem *)rtc->regs + + (reg * rtc->regstep))); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Inlined functions */ + +/** + * ds1685_rtc_bcd2bin - bcd2bin wrapper in case platform doesn't support BCD. + * @rtc: pointer to the ds1685 rtc structure. + * @val: u8 time value to consider converting. + * @bcd_mask: u8 mask value if BCD mode is used. + * @bin_mask: u8 mask value if BIN mode is used. + * + * Returns the value, converted to BIN if originally in BCD and bcd_mode TRUE. + */ +static inline u8 +ds1685_rtc_bcd2bin(struct ds1685_priv *rtc, u8 val, u8 bcd_mask, u8 bin_mask) +{ + if (rtc->bcd_mode) + return (bcd2bin(val) & bcd_mask); + + return (val & bin_mask); +} + +/** + * ds1685_rtc_bin2bcd - bin2bcd wrapper in case platform doesn't support BCD. + * @rtc: pointer to the ds1685 rtc structure. + * @val: u8 time value to consider converting. + * @bin_mask: u8 mask value if BIN mode is used. + * @bcd_mask: u8 mask value if BCD mode is used. + * + * Returns the value, converted to BCD if originally in BIN and bcd_mode TRUE. + */ +static inline u8 +ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask) +{ + if (rtc->bcd_mode) + return (bin2bcd(val) & bcd_mask); + + return (val & bin_mask); +} + +/** + * ds1685_rtc_switch_to_bank0 - switch the rtc to bank 0. + * @rtc: pointer to the ds1685 rtc structure. + */ +static inline void +ds1685_rtc_switch_to_bank0(struct ds1685_priv *rtc) +{ + rtc->write(rtc, RTC_CTRL_A, + (rtc->read(rtc, RTC_CTRL_A) & ~(RTC_CTRL_A_DV0))); +} + +/** + * ds1685_rtc_switch_to_bank1 - switch the rtc to bank 1. + * @rtc: pointer to the ds1685 rtc structure. + */ +static inline void +ds1685_rtc_switch_to_bank1(struct ds1685_priv *rtc) +{ + rtc->write(rtc, RTC_CTRL_A, + (rtc->read(rtc, RTC_CTRL_A) | RTC_CTRL_A_DV0)); +} + +/** + * ds1685_rtc_begin_data_access - prepare the rtc for data access. + * @rtc: pointer to the ds1685 rtc structure. + * + * This takes several steps to prepare the rtc for access to get/set time + * and alarm values from the rtc registers: + * - Sets the SET bit in Control Register B. + * - Reads Ext Control Register 4A and checks the INCR bit. + * - If INCR is active, a short delay is added before Ext Control Register 4A + * is read again in a loop until INCR is inactive. + * - Switches the rtc to bank 1. This allows access to all relevant + * data for normal rtc operation, as bank 0 contains only the nvram. + */ +static inline void +ds1685_rtc_begin_data_access(struct ds1685_priv *rtc) +{ + /* Set the SET bit in Ctrl B */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET)); + + /* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */ + while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR) + cpu_relax(); + + /* Switch to Bank 1 */ + ds1685_rtc_switch_to_bank1(rtc); +} + +/** + * ds1685_rtc_end_data_access - end data access on the rtc. + * @rtc: pointer to the ds1685 rtc structure. + * + * This ends what was started by ds1685_rtc_begin_data_access: + * - Switches the rtc back to bank 0. + * - Clears the SET bit in Control Register B. + */ +static inline void +ds1685_rtc_end_data_access(struct ds1685_priv *rtc) +{ + /* Switch back to Bank 0 */ + ds1685_rtc_switch_to_bank1(rtc); + + /* Clear the SET bit in Ctrl B */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET))); +} + +/** + * ds1685_rtc_begin_ctrl_access - prepare the rtc for ctrl access. + * @rtc: pointer to the ds1685 rtc structure. + * @flags: irq flags variable for spin_lock_irqsave. + * + * This takes several steps to prepare the rtc for access to read just the + * control registers: + * - Sets a spinlock on the rtc IRQ. + * - Switches the rtc to bank 1. This allows access to the two extended + * control registers. + * + * Only use this where you are certain another lock will not be held. + */ +static inline void +ds1685_rtc_begin_ctrl_access(struct ds1685_priv *rtc, unsigned long flags) +{ + spin_lock_irqsave(&rtc->lock, flags); + ds1685_rtc_switch_to_bank1(rtc); +} + +/** + * ds1685_rtc_end_ctrl_access - end ctrl access on the rtc. + * @rtc: pointer to the ds1685 rtc structure. + * @flags: irq flags variable for spin_unlock_irqrestore. + * + * This ends what was started by ds1685_rtc_begin_ctrl_access: + * - Switches the rtc back to bank 0. + * - Unsets the spinlock on the rtc IRQ. + */ +static inline void +ds1685_rtc_end_ctrl_access(struct ds1685_priv *rtc, unsigned long flags) +{ + ds1685_rtc_switch_to_bank0(rtc); + spin_unlock_irqrestore(&rtc->lock, flags); +} + +/** + * ds1685_rtc_get_ssn - retrieve the silicon serial number. + * @rtc: pointer to the ds1685 rtc structure. + * @ssn: u8 array to hold the bits of the silicon serial number. + * + * This number starts at 0x40, and is 8-bytes long, ending at 0x47. The + * first byte is the model number, the next six bytes are the serial number + * digits, and the final byte is a CRC check byte. Together, they form the + * silicon serial number. + * + * These values are stored in bank1, so ds1685_rtc_switch_to_bank1 must be + * called first before calling this function, else data will be read out of + * the bank0 NVRAM. Be sure to call ds1685_rtc_switch_to_bank0 when done. + */ +static inline void +ds1685_rtc_get_ssn(struct ds1685_priv *rtc, u8 *ssn) +{ + ssn[0] = rtc->read(rtc, RTC_BANK1_SSN_MODEL); + ssn[1] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_1); + ssn[2] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_2); + ssn[3] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_3); + ssn[4] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_4); + ssn[5] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_5); + ssn[6] = rtc->read(rtc, RTC_BANK1_SSN_BYTE_6); + ssn[7] = rtc->read(rtc, RTC_BANK1_SSN_CRC); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Read/Set Time & Alarm functions */ + +/** + * ds1685_rtc_read_time - reads the time registers. + * @dev: pointer to device structure. + * @tm: pointer to rtc_time structure. + */ +static int +ds1685_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, century; + u8 seconds, minutes, hours, wday, mday, month, years; + + /* Fetch the time info from the RTC registers. */ + ds1685_rtc_begin_data_access(rtc); + seconds = rtc->read(rtc, RTC_SECS); + minutes = rtc->read(rtc, RTC_MINS); + hours = rtc->read(rtc, RTC_HRS); + wday = rtc->read(rtc, RTC_WDAY); + mday = rtc->read(rtc, RTC_MDAY); + month = rtc->read(rtc, RTC_MONTH); + years = rtc->read(rtc, RTC_YEAR); + century = rtc->read(rtc, RTC_CENTURY); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ds1685_rtc_end_data_access(rtc); + + /* bcd2bin if needed, perform fixups, and store to rtc_time. */ + years = ds1685_rtc_bcd2bin(rtc, years, RTC_YEAR_BCD_MASK, + RTC_YEAR_BIN_MASK); + century = ds1685_rtc_bcd2bin(rtc, century, RTC_CENTURY_MASK, + RTC_CENTURY_MASK); + tm->tm_sec = ds1685_rtc_bcd2bin(rtc, seconds, RTC_SECS_BCD_MASK, + RTC_SECS_BIN_MASK); + tm->tm_min = ds1685_rtc_bcd2bin(rtc, minutes, RTC_MINS_BCD_MASK, + RTC_MINS_BIN_MASK); + tm->tm_hour = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_24_BCD_MASK, + RTC_HRS_24_BIN_MASK); + tm->tm_wday = (ds1685_rtc_bcd2bin(rtc, wday, RTC_WDAY_MASK, + RTC_WDAY_MASK) - 1); + tm->tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK, + RTC_MDAY_BIN_MASK); + tm->tm_mon = (ds1685_rtc_bcd2bin(rtc, month, RTC_MONTH_BCD_MASK, + RTC_MONTH_BIN_MASK) - 1); + tm->tm_year = ((years + (century * 100)) - 1900); + tm->tm_yday = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year); + tm->tm_isdst = 0; /* RTC has hardcoded timezone, so don't use. */ + + return rtc_valid_tm(tm); +} + +/** + * ds1685_rtc_set_time - sets the time registers. + * @dev: pointer to device structure. + * @tm: pointer to rtc_time structure. + */ +static int +ds1685_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, seconds, minutes, hours, wday, mday, month, years, century; + + /* Fetch the time info from rtc_time. */ + seconds = ds1685_rtc_bin2bcd(rtc, tm->tm_sec, RTC_SECS_BIN_MASK, + RTC_SECS_BCD_MASK); + minutes = ds1685_rtc_bin2bcd(rtc, tm->tm_min, RTC_MINS_BIN_MASK, + RTC_MINS_BCD_MASK); + hours = ds1685_rtc_bin2bcd(rtc, tm->tm_hour, RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK); + wday = ds1685_rtc_bin2bcd(rtc, (tm->tm_wday + 1), RTC_WDAY_MASK, + RTC_WDAY_MASK); + mday = ds1685_rtc_bin2bcd(rtc, tm->tm_mday, RTC_MDAY_BIN_MASK, + RTC_MDAY_BCD_MASK); + month = ds1685_rtc_bin2bcd(rtc, (tm->tm_mon + 1), RTC_MONTH_BIN_MASK, + RTC_MONTH_BCD_MASK); + years = ds1685_rtc_bin2bcd(rtc, (tm->tm_year % 100), + RTC_YEAR_BIN_MASK, RTC_YEAR_BCD_MASK); + century = ds1685_rtc_bin2bcd(rtc, ((tm->tm_year + 1900) / 100), + RTC_CENTURY_MASK, RTC_CENTURY_MASK); + + /* + * Perform Sanity Checks: + * - Months: !> 12, Month Day != 0. + * - Month Day !> Max days in current month. + * - Hours !>= 24, Mins !>= 60, Secs !>= 60, & Weekday !> 7. + */ + if ((tm->tm_mon > 11) || (mday == 0)) + return -EDOM; + + if (tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year)) + return -EDOM; + + if ((tm->tm_hour >= 24) || (tm->tm_min >= 60) || + (tm->tm_sec >= 60) || (wday > 7)) + return -EDOM; + + /* + * Set the data mode to use and store the time values in the + * RTC registers. + */ + ds1685_rtc_begin_data_access(rtc); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + if (rtc->bcd_mode) + ctrlb &= ~(RTC_CTRL_B_DM); + else + ctrlb |= RTC_CTRL_B_DM; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + rtc->write(rtc, RTC_SECS, seconds); + rtc->write(rtc, RTC_MINS, minutes); + rtc->write(rtc, RTC_HRS, hours); + rtc->write(rtc, RTC_WDAY, wday); + rtc->write(rtc, RTC_MDAY, mday); + rtc->write(rtc, RTC_MONTH, month); + rtc->write(rtc, RTC_YEAR, years); + rtc->write(rtc, RTC_CENTURY, century); + ds1685_rtc_end_data_access(rtc); + + return 0; +} + +/** + * ds1685_rtc_read_alarm - reads the alarm registers. + * @dev: pointer to device structure. + * @alrm: pointer to rtc_wkalrm structure. + * + * There are three primary alarm registers: seconds, minutes, and hours. + * A fourth alarm register for the month date is also available in bank1 for + * kickstart/wakeup features. The DS1685/DS1687 manual states that a + * "don't care" value ranging from 0xc0 to 0xff may be written into one or + * more of the three alarm bytes to act as a wildcard value. The fourth + * byte doesn't support a "don't care" value. + */ +static int +ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 seconds, minutes, hours, mday, ctrlb, ctrlc; + + /* Fetch the alarm info from the RTC alarm registers. */ + ds1685_rtc_begin_data_access(rtc); + seconds = rtc->read(rtc, RTC_SECS_ALARM); + minutes = rtc->read(rtc, RTC_MINS_ALARM); + hours = rtc->read(rtc, RTC_HRS_ALARM); + mday = rtc->read(rtc, RTC_MDAY_ALARM); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + ds1685_rtc_end_data_access(rtc); + + /* Check month date. */ + if (!(mday >= 1) && (mday <= 31)) + return -EDOM; + + /* + * Check the three alarm bytes. + * + * The Linux RTC system doesn't support the "don't care" capability + * of this RTC chip. We check for it anyways in case support is + * added in the future. + */ + if (unlikely((seconds >= 0xc0) && (seconds <= 0xff))) + alrm->time.tm_sec = -1; + else + alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds, + RTC_SECS_BCD_MASK, + RTC_SECS_BIN_MASK); + + if (unlikely((minutes >= 0xc0) && (minutes <= 0xff))) + alrm->time.tm_min = -1; + else + alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes, + RTC_MINS_BCD_MASK, + RTC_MINS_BIN_MASK); + + if (unlikely((hours >= 0xc0) && (hours <= 0xff))) + alrm->time.tm_hour = -1; + else + alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours, + RTC_HRS_24_BCD_MASK, + RTC_HRS_24_BIN_MASK); + + /* Write the data to rtc_wkalrm. */ + alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK, + RTC_MDAY_BIN_MASK); + alrm->time.tm_mon = -1; + alrm->time.tm_year = -1; + alrm->time.tm_wday = -1; + alrm->time.tm_yday = -1; + alrm->time.tm_isdst = -1; + alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE); + alrm->pending = !!(ctrlc & RTC_CTRL_C_AF); + + return 0; +} + +/** + * ds1685_rtc_set_alarm - sets the alarm in registers. + * @dev: pointer to device structure. + * @alrm: pointer to rtc_wkalrm structure. + */ +static int +ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, seconds, minutes, hours, mday; + + /* Fetch the alarm info and convert to BCD. */ + seconds = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec, + RTC_SECS_BIN_MASK, + RTC_SECS_BCD_MASK); + minutes = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_min, + RTC_MINS_BIN_MASK, + RTC_MINS_BCD_MASK); + hours = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_hour, + RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK); + mday = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_mday, + RTC_MDAY_BIN_MASK, + RTC_MDAY_BCD_MASK); + + /* Check the month date for validity. */ + if (!(mday >= 1) && (mday <= 31)) + return -EDOM; + + /* + * Check the three alarm bytes. + * + * The Linux RTC system doesn't support the "don't care" capability + * of this RTC chip because rtc_valid_tm tries to validate every + * field, and we only support four fields. We put the support + * here anyways for the future. + */ + if (unlikely((seconds >= 0xc0) && (seconds <= 0xff))) + seconds = 0xff; + + if (unlikely((minutes >= 0xc0) && (minutes <= 0xff))) + minutes = 0xff; + + if (unlikely((hours >= 0xc0) && (hours <= 0xff))) + hours = 0xff; + + alrm->time.tm_mon = -1; + alrm->time.tm_year = -1; + alrm->time.tm_wday = -1; + alrm->time.tm_yday = -1; + alrm->time.tm_isdst = -1; + + /* Disable the alarm interrupt first. */ + ds1685_rtc_begin_data_access(rtc); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + rtc->write(rtc, RTC_CTRL_B, (ctrlb & ~(RTC_CTRL_B_AIE))); + + /* Read ctrlc to clear RTC_CTRL_C_AF. */ + rtc->read(rtc, RTC_CTRL_C); + + /* + * Set the data mode to use and store the time values in the + * RTC registers. + */ + ctrlb = rtc->read(rtc, RTC_CTRL_B); + if (rtc->bcd_mode) + ctrlb &= ~(RTC_CTRL_B_DM); + else + ctrlb |= RTC_CTRL_B_DM; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + rtc->write(rtc, RTC_SECS_ALARM, seconds); + rtc->write(rtc, RTC_MINS_ALARM, minutes); + rtc->write(rtc, RTC_HRS_ALARM, hours); + rtc->write(rtc, RTC_MDAY_ALARM, mday); + + /* Re-enable the alarm if needed. */ + if (alrm->enabled) { + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlb |= RTC_CTRL_B_AIE; + rtc->write(rtc, RTC_CTRL_B, ctrlb); + } + + /* Done! */ + ds1685_rtc_end_data_access(rtc); + + return 0; +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* /dev/rtcX Interface functions */ + +#ifdef CONFIG_RTC_INTF_DEV +/** + * ds1685_rtc_alarm_irq_enable - replaces ioctl() RTC_AIE on/off. + * @dev: pointer to device structure. + * @enabled: flag indicating whether to enable or disable. + */ +static int +ds1685_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct ds1685_priv *rtc = dev_get_drvdata(dev); + unsigned long flags = 0; + + /* Enable/disable the Alarm IRQ-Enable flag. */ + spin_lock_irqsave(&rtc->lock, flags); + + /* Flip the requisite interrupt-enable bit. */ + if (enabled) + rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) | + RTC_CTRL_B_AIE)); + else + rtc->write(rtc, RTC_CTRL_B, (rtc->read(rtc, RTC_CTRL_B) & + ~(RTC_CTRL_B_AIE))); + + /* Read Control C to clear all the flag bits. */ + rtc->read(rtc, RTC_CTRL_C); + spin_unlock_irqrestore(&rtc->lock, flags); + + return 0; +} +#endif +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* IRQ handler & workqueue. */ + +/** + * ds1685_rtc_irq_handler - IRQ handler. + * @irq: IRQ number. + * @dev_id: platform device pointer. + */ +static irqreturn_t +ds1685_rtc_irq_handler(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrlb, ctrlc; + unsigned long events = 0; + u8 num_irqs = 0; + + /* Abort early if the device isn't ready yet (i.e., DEBUG_SHIRQ). */ + if (unlikely(!rtc)) + return IRQ_HANDLED; + + /* Ctrlb holds the interrupt-enable bits and ctrlc the flag bits. */ + spin_lock(&rtc->lock); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + + /* Is the IRQF bit set? */ + if (likely(ctrlc & RTC_CTRL_C_IRQF)) { + /* + * We need to determine if it was one of the standard + * events: PF, AF, or UF. If so, we handle them and + * update the RTC core. + */ + if (likely(ctrlc & RTC_CTRL_B_PAU_MASK)) { + events = RTC_IRQF; + + /* Check for a periodic interrupt. */ + if ((ctrlb & RTC_CTRL_B_PIE) && + (ctrlc & RTC_CTRL_C_PF)) { + events |= RTC_PF; + num_irqs++; + } + + /* Check for an alarm interrupt. */ + if ((ctrlb & RTC_CTRL_B_AIE) && + (ctrlc & RTC_CTRL_C_AF)) { + events |= RTC_AF; + num_irqs++; + } + + /* Check for an update interrupt. */ + if ((ctrlb & RTC_CTRL_B_UIE) && + (ctrlc & RTC_CTRL_C_UF)) { + events |= RTC_UF; + num_irqs++; + } + + rtc_update_irq(rtc->dev, num_irqs, events); + } else { + /* + * One of the "extended" interrupts was received that + * is not recognized by the RTC core. These need to + * be handled in task context as they can call other + * functions and the time spent in irq context needs + * to be minimized. Schedule them into a workqueue + * and inform the RTC core that the IRQs were handled. + */ + spin_unlock(&rtc->lock); + schedule_work(&rtc->work); + rtc_update_irq(rtc->dev, 0, 0); + return IRQ_HANDLED; + } + } + spin_unlock(&rtc->lock); + + return events ? IRQ_HANDLED : IRQ_NONE; +} + +/** + * ds1685_rtc_work_queue - work queue handler. + * @work: work_struct containing data to work on in task context. + */ +static void +ds1685_rtc_work_queue(struct work_struct *work) +{ + struct ds1685_priv *rtc = container_of(work, + struct ds1685_priv, work); + struct platform_device *pdev = to_platform_device(&rtc->dev->dev); + struct mutex *rtc_mutex = &rtc->dev->ops_lock; + u8 ctrl4a, ctrl4b; + + mutex_lock(rtc_mutex); + + ds1685_rtc_switch_to_bank1(rtc); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + + /* + * Check for a kickstart interrupt. With Vcc applied, this + * typically means that the power button was pressed, so we + * begin the shutdown sequence. + */ + if ((ctrl4b & RTC_CTRL_4B_KSE) && (ctrl4a & RTC_CTRL_4A_KF)) { + /* Briefly disable kickstarts to debounce button presses. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) & + ~(RTC_CTRL_4B_KSE))); + + /* Clear the kickstart flag. */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_KF))); + + + /* + * Sleep 500ms before re-enabling kickstarts. This allows + * adequate time to avoid reading signal jitter as additional + * button presses. + */ + msleep(500); + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) | + RTC_CTRL_4B_KSE)); + + /* Call the platform pre-poweroff function. Else, shutdown. */ + if (rtc->prepare_poweroff != NULL) + rtc->prepare_poweroff(); + else + ds1685_rtc_poweroff(pdev); + } + + /* + * Check for a wake-up interrupt. With Vcc applied, this is + * essentially a second alarm interrupt, except it takes into + * account the 'date' register in bank1 in addition to the + * standard three alarm registers. + */ + if ((ctrl4b & RTC_CTRL_4B_WIE) && (ctrl4a & RTC_CTRL_4A_WF)) { + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_WF))); + + /* Call the platform wake_alarm function if defined. */ + if (rtc->wake_alarm != NULL) + rtc->wake_alarm(); + else + dev_warn(&pdev->dev, + "Wake Alarm IRQ just occurred!\n"); + } + + /* + * Check for a ram-clear interrupt. This happens if RIE=1 and RF=0 + * when RCE=1 in 4B. This clears all NVRAM bytes in bank0 by setting + * each byte to a logic 1. This has no effect on any extended + * NV-SRAM that might be present, nor on the time/calendar/alarm + * registers. After a ram-clear is completed, there is a minimum + * recovery time of ~150ms in which all reads/writes are locked out. + * NOTE: A ram-clear can still occur if RCE=1 and RIE=0. We cannot + * catch this scenario. + */ + if ((ctrl4b & RTC_CTRL_4B_RIE) && (ctrl4a & RTC_CTRL_4A_RF)) { + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a & ~(RTC_CTRL_4A_RF))); + msleep(150); + + /* Call the platform post_ram_clear function if defined. */ + if (rtc->post_ram_clear != NULL) + rtc->post_ram_clear(); + else + dev_warn(&pdev->dev, + "RAM-Clear IRQ just occurred!\n"); + } + ds1685_rtc_switch_to_bank0(rtc); + + mutex_unlock(rtc_mutex); +} +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* ProcFS interface */ + +#ifdef CONFIG_PROC_FS +#define NUM_REGS 6 /* Num of control registers. */ +#define NUM_BITS 8 /* Num bits per register. */ +#define NUM_SPACES 4 /* Num spaces between each bit. */ + +/* + * Periodic Interrupt Rates. + */ +static const char *ds1685_rtc_pirq_rate[16] = { + "none", "3.90625ms", "7.8125ms", "0.122070ms", "0.244141ms", + "0.488281ms", "0.9765625ms", "1.953125ms", "3.90625ms", "7.8125ms", + "15.625ms", "31.25ms", "62.5ms", "125ms", "250ms", "500ms" +}; + +/* + * Square-Wave Output Frequencies. + */ +static const char *ds1685_rtc_sqw_freq[16] = { + "none", "256Hz", "128Hz", "8192Hz", "4096Hz", "2048Hz", "1024Hz", + "512Hz", "256Hz", "128Hz", "64Hz", "32Hz", "16Hz", "8Hz", "4Hz", "2Hz" +}; + +#ifdef CONFIG_RTC_DS1685_PROC_REGS +/** + * ds1685_rtc_print_regs - helper function to print register values. + * @hex: hex byte to convert into binary bits. + * @dest: destination char array. + * + * This is basically a hex->binary function, just with extra spacing between + * the digits. It only works on 1-byte values (8 bits). + */ +static char* +ds1685_rtc_print_regs(u8 hex, char *dest) +{ + u32 i, j; + char *tmp = dest; + + for (i = 0; i < NUM_BITS; i++) { + *tmp++ = ((hex & 0x80) != 0 ? '1' : '0'); + for (j = 0; j < NUM_SPACES; j++) + *tmp++ = ' '; + hex <<= 1; + } + *tmp++ = '\0'; + + return dest; +} +#endif + +/** + * ds1685_rtc_proc - procfs access function. + * @dev: pointer to device structure. + * @seq: pointer to seq_file structure. + */ +static int +ds1685_rtc_proc(struct device *dev, struct seq_file *seq) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrla, ctrlb, ctrlc, ctrld, ctrl4a, ctrl4b, ssn[8]; + char *model = '\0'; +#ifdef CONFIG_RTC_DS1685_PROC_REGS + char bits[NUM_REGS][(NUM_BITS * NUM_SPACES) + NUM_BITS + 1]; +#endif + + /* Read all the relevant data from the control registers. */ + ds1685_rtc_switch_to_bank1(rtc); + ds1685_rtc_get_ssn(rtc, ssn); + ctrla = rtc->read(rtc, RTC_CTRL_A); + ctrlb = rtc->read(rtc, RTC_CTRL_B); + ctrlc = rtc->read(rtc, RTC_CTRL_C); + ctrld = rtc->read(rtc, RTC_CTRL_D); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + ds1685_rtc_switch_to_bank0(rtc); + + /* Determine the RTC model. */ + switch (ssn[0]) { + case RTC_MODEL_DS1685: + model = "DS1685/DS1687\0"; + break; + case RTC_MODEL_DS1689: + model = "DS1689/DS1693\0"; + break; + case RTC_MODEL_DS17285: + model = "DS17285/DS17287\0"; + break; + case RTC_MODEL_DS17485: + model = "DS17485/DS17487\0"; + break; + case RTC_MODEL_DS17885: + model = "DS17885/DS17887\0"; + break; + default: + model = "Unknown\0"; + break; + } + + /* Print out the information. */ + seq_printf(seq, + "Model\t\t: %s\n" + "Oscillator\t: %s\n" + "12/24hr\t\t: %s\n" + "DST\t\t: %s\n" + "Data mode\t: %s\n" + "Battery\t\t: %s\n" + "Aux batt\t: %s\n" + "Update IRQ\t: %s\n" + "Periodic IRQ\t: %s\n" + "Periodic Rate\t: %s\n" + "SQW Freq\t: %s\n" +#ifdef CONFIG_RTC_DS1685_PROC_REGS + "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n" + "Register Status\t:\n" + " Ctrl A\t: UIP DV2 DV1 DV0 RS3 RS2 RS1 RS0\n" + "\t\t: %s\n" + " Ctrl B\t: SET PIE AIE UIE SQWE DM 2412 DSE\n" + "\t\t: %s\n" + " Ctrl C\t: IRQF PF AF UF --- --- --- ---\n" + "\t\t: %s\n" + " Ctrl D\t: VRT --- --- --- --- --- --- ---\n" + "\t\t: %s\n" +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) + " Ctrl 4A\t: VRT2 INCR BME --- PAB RF WF KF\n" +#else + " Ctrl 4A\t: VRT2 INCR --- --- PAB RF WF KF\n" +#endif + "\t\t: %s\n" + " Ctrl 4B\t: ABE E32k CS RCE PRS RIE WIE KSE\n" + "\t\t: %s\n", +#else + "Serial #\t: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", +#endif + model, + ((ctrla & RTC_CTRL_A_DV1) ? "enabled" : "disabled"), + ((ctrlb & RTC_CTRL_B_2412) ? "24-hour" : "12-hour"), + ((ctrlb & RTC_CTRL_B_DSE) ? "enabled" : "disabled"), + ((ctrlb & RTC_CTRL_B_DM) ? "binary" : "BCD"), + ((ctrld & RTC_CTRL_D_VRT) ? "ok" : "exhausted or n/a"), + ((ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "exhausted or n/a"), + ((ctrlb & RTC_CTRL_B_UIE) ? "yes" : "no"), + ((ctrlb & RTC_CTRL_B_PIE) ? "yes" : "no"), + (!(ctrl4b & RTC_CTRL_4B_E32K) ? + ds1685_rtc_pirq_rate[(ctrla & RTC_CTRL_A_RS_MASK)] : "none"), + (!((ctrl4b & RTC_CTRL_4B_E32K)) ? + ds1685_rtc_sqw_freq[(ctrla & RTC_CTRL_A_RS_MASK)] : "32768Hz"), +#ifdef CONFIG_RTC_DS1685_PROC_REGS + ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7], + ds1685_rtc_print_regs(ctrla, bits[0]), + ds1685_rtc_print_regs(ctrlb, bits[1]), + ds1685_rtc_print_regs(ctrlc, bits[2]), + ds1685_rtc_print_regs(ctrld, bits[3]), + ds1685_rtc_print_regs(ctrl4a, bits[4]), + ds1685_rtc_print_regs(ctrl4b, bits[5])); +#else + ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], ssn[6], ssn[7]); +#endif + return 0; +} +#else +#define ds1685_rtc_proc NULL +#endif /* CONFIG_PROC_FS */ +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* RTC Class operations */ + +static const struct rtc_class_ops +ds1685_rtc_ops = { + .proc = ds1685_rtc_proc, + .read_time = ds1685_rtc_read_time, + .set_time = ds1685_rtc_set_time, + .read_alarm = ds1685_rtc_read_alarm, + .set_alarm = ds1685_rtc_set_alarm, + .alarm_irq_enable = ds1685_rtc_alarm_irq_enable, +}; +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* SysFS interface */ + +#ifdef CONFIG_SYSFS +/** + * ds1685_rtc_sysfs_nvram_read - reads rtc nvram via sysfs. + * @file: pointer to file structure. + * @kobj: pointer to kobject structure. + * @bin_attr: pointer to bin_attribute structure. + * @buf: pointer to char array to hold the output. + * @pos: current file position pointer. + * @size: size of the data to read. + */ +static ssize_t +ds1685_rtc_sysfs_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t pos, size_t size) +{ + struct platform_device *pdev = + to_platform_device(container_of(kobj, struct device, kobj)); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + ssize_t count; + unsigned long flags = 0; + + spin_lock_irqsave(&rtc->lock, flags); + ds1685_rtc_switch_to_bank0(rtc); + + /* Read NVRAM in time and bank0 registers. */ + for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0; + count++, size--) { + if (count < NVRAM_SZ_TIME) + *buf++ = rtc->read(rtc, (NVRAM_TIME_BASE + pos++)); + else + *buf++ = rtc->read(rtc, (NVRAM_BANK0_BASE + pos++)); + } + +#ifndef CONFIG_RTC_DRV_DS1689 + if (size > 0) { + ds1685_rtc_switch_to_bank1(rtc); + +#ifndef CONFIG_RTC_DRV_DS1685 + /* Enable burst-mode on DS17x85/DS17x87 */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) | + RTC_CTRL_4A_BME)); + + /* We need one write to RTC_BANK1_RAM_ADDR_LSB to start + * reading with burst-mode */ + rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB, + (pos - NVRAM_TOTAL_SZ_BANK0)); +#endif + + /* Read NVRAM in bank1 registers. */ + for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ; + count++, size--) { +#ifdef CONFIG_RTC_DRV_DS1685 + /* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR + * before each read. */ + rtc->write(rtc, RTC_BANK1_RAM_ADDR, + (pos - NVRAM_TOTAL_SZ_BANK0)); +#endif + *buf++ = rtc->read(rtc, RTC_BANK1_RAM_DATA_PORT); + pos++; + } + +#ifndef CONFIG_RTC_DRV_DS1685 + /* Disable burst-mode on DS17x85/DS17x87 */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) & + ~(RTC_CTRL_4A_BME))); +#endif + ds1685_rtc_switch_to_bank0(rtc); + } +#endif /* !CONFIG_RTC_DRV_DS1689 */ + spin_unlock_irqrestore(&rtc->lock, flags); + + /* + * XXX: Bug? this appears to cause the function to get executed + * several times in succession. But it's the only way to actually get + * data written out to a file. + */ + return count; +} + +/** + * ds1685_rtc_sysfs_nvram_write - writes rtc nvram via sysfs. + * @file: pointer to file structure. + * @kobj: pointer to kobject structure. + * @bin_attr: pointer to bin_attribute structure. + * @buf: pointer to char array to hold the input. + * @pos: current file position pointer. + * @size: size of the data to write. + */ +static ssize_t +ds1685_rtc_sysfs_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t pos, size_t size) +{ + struct platform_device *pdev = + to_platform_device(container_of(kobj, struct device, kobj)); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + ssize_t count; + unsigned long flags = 0; + + spin_lock_irqsave(&rtc->lock, flags); + ds1685_rtc_switch_to_bank0(rtc); + + /* Write NVRAM in time and bank0 registers. */ + for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ_BANK0; + count++, size--) + if (count < NVRAM_SZ_TIME) + rtc->write(rtc, (NVRAM_TIME_BASE + pos++), + *buf++); + else + rtc->write(rtc, (NVRAM_BANK0_BASE), *buf++); + +#ifndef CONFIG_RTC_DRV_DS1689 + if (size > 0) { + ds1685_rtc_switch_to_bank1(rtc); + +#ifndef CONFIG_RTC_DRV_DS1685 + /* Enable burst-mode on DS17x85/DS17x87 */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) | + RTC_CTRL_4A_BME)); + + /* We need one write to RTC_BANK1_RAM_ADDR_LSB to start + * writing with burst-mode */ + rtc->write(rtc, RTC_BANK1_RAM_ADDR_LSB, + (pos - NVRAM_TOTAL_SZ_BANK0)); +#endif + + /* Write NVRAM in bank1 registers. */ + for (count = 0; size > 0 && pos < NVRAM_TOTAL_SZ; + count++, size--) { +#ifdef CONFIG_RTC_DRV_DS1685 + /* DS1685/DS1687 has to write to RTC_BANK1_RAM_ADDR + * before each read. */ + rtc->write(rtc, RTC_BANK1_RAM_ADDR, + (pos - NVRAM_TOTAL_SZ_BANK0)); +#endif + rtc->write(rtc, RTC_BANK1_RAM_DATA_PORT, *buf++); + pos++; + } + +#ifndef CONFIG_RTC_DRV_DS1685 + /* Disable burst-mode on DS17x85/DS17x87 */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) & + ~(RTC_CTRL_4A_BME))); +#endif + ds1685_rtc_switch_to_bank0(rtc); + } +#endif /* !CONFIG_RTC_DRV_DS1689 */ + spin_unlock_irqrestore(&rtc->lock, flags); + + return count; +} + +/** + * struct ds1685_rtc_sysfs_nvram_attr - sysfs attributes for rtc nvram. + * @attr: nvram attributes. + * @read: nvram read function. + * @write: nvram write function. + * @size: nvram total size (bank0 + extended). + */ +static struct bin_attribute +ds1685_rtc_sysfs_nvram_attr = { + .attr = { + .name = "nvram", + .mode = S_IRUGO | S_IWUSR, + }, + .read = ds1685_rtc_sysfs_nvram_read, + .write = ds1685_rtc_sysfs_nvram_write, + .size = NVRAM_TOTAL_SZ +}; + +/** + * ds1685_rtc_sysfs_battery_show - sysfs file for main battery status. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_battery_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrld; + + ctrld = rtc->read(rtc, RTC_CTRL_D); + + return snprintf(buf, 13, "%s\n", + (ctrld & RTC_CTRL_D_VRT) ? "ok" : "not ok or N/A"); +} +static DEVICE_ATTR(battery, S_IRUGO, ds1685_rtc_sysfs_battery_show, NULL); + +/** + * ds1685_rtc_sysfs_auxbatt_show - sysfs file for aux battery status. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_auxbatt_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ctrl4a; + + ds1685_rtc_switch_to_bank1(rtc); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + ds1685_rtc_switch_to_bank0(rtc); + + return snprintf(buf, 13, "%s\n", + (ctrl4a & RTC_CTRL_4A_VRT2) ? "ok" : "not ok or N/A"); +} +static DEVICE_ATTR(auxbatt, S_IRUGO, ds1685_rtc_sysfs_auxbatt_show, NULL); + +/** + * ds1685_rtc_sysfs_serial_show - sysfs file for silicon serial number. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_serial_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *pdev = to_platform_device(dev); + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + u8 ssn[8]; + + ds1685_rtc_switch_to_bank1(rtc); + ds1685_rtc_get_ssn(rtc, ssn); + ds1685_rtc_switch_to_bank0(rtc); + + return snprintf(buf, 24, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + ssn[0], ssn[1], ssn[2], ssn[3], ssn[4], ssn[5], + ssn[6], ssn[7]); + + return 0; +} +static DEVICE_ATTR(serial, S_IRUGO, ds1685_rtc_sysfs_serial_show, NULL); + +/** + * struct ds1685_rtc_sysfs_misc_attrs - list for misc RTC features. + */ +static struct attribute* +ds1685_rtc_sysfs_misc_attrs[] = { + &dev_attr_battery.attr, + &dev_attr_auxbatt.attr, + &dev_attr_serial.attr, + NULL, +}; + +/** + * struct ds1685_rtc_sysfs_misc_grp - attr group for misc RTC features. + */ +static const struct attribute_group +ds1685_rtc_sysfs_misc_grp = { + .name = "misc", + .attrs = ds1685_rtc_sysfs_misc_attrs, +}; + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS +/** + * struct ds1685_rtc_ctrl_regs. + * @name: char pointer for the bit name. + * @reg: control register the bit is in. + * @bit: the bit's offset in the register. + */ +struct ds1685_rtc_ctrl_regs { + const char *name; + const u8 reg; + const u8 bit; +}; + +/* + * Ctrl register bit lookup table. + */ +static const struct ds1685_rtc_ctrl_regs +ds1685_ctrl_regs_table[] = { + { "uip", RTC_CTRL_A, RTC_CTRL_A_UIP }, + { "dv2", RTC_CTRL_A, RTC_CTRL_A_DV2 }, + { "dv1", RTC_CTRL_A, RTC_CTRL_A_DV1 }, + { "dv0", RTC_CTRL_A, RTC_CTRL_A_DV0 }, + { "rs3", RTC_CTRL_A, RTC_CTRL_A_RS3 }, + { "rs2", RTC_CTRL_A, RTC_CTRL_A_RS2 }, + { "rs1", RTC_CTRL_A, RTC_CTRL_A_RS1 }, + { "rs0", RTC_CTRL_A, RTC_CTRL_A_RS0 }, + { "set", RTC_CTRL_B, RTC_CTRL_B_SET }, + { "pie", RTC_CTRL_B, RTC_CTRL_B_PIE }, + { "aie", RTC_CTRL_B, RTC_CTRL_B_AIE }, + { "uie", RTC_CTRL_B, RTC_CTRL_B_UIE }, + { "sqwe", RTC_CTRL_B, RTC_CTRL_B_SQWE }, + { "dm", RTC_CTRL_B, RTC_CTRL_B_DM }, + { "2412", RTC_CTRL_B, RTC_CTRL_B_2412 }, + { "dse", RTC_CTRL_B, RTC_CTRL_B_DSE }, + { "irqf", RTC_CTRL_C, RTC_CTRL_C_IRQF }, + { "pf", RTC_CTRL_C, RTC_CTRL_C_PF }, + { "af", RTC_CTRL_C, RTC_CTRL_C_AF }, + { "uf", RTC_CTRL_C, RTC_CTRL_C_UF }, + { "vrt", RTC_CTRL_D, RTC_CTRL_D_VRT }, + { "vrt2", RTC_EXT_CTRL_4A, RTC_CTRL_4A_VRT2 }, + { "incr", RTC_EXT_CTRL_4A, RTC_CTRL_4A_INCR }, + { "pab", RTC_EXT_CTRL_4A, RTC_CTRL_4A_PAB }, + { "rf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_RF }, + { "wf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_WF }, + { "kf", RTC_EXT_CTRL_4A, RTC_CTRL_4A_KF }, +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) + { "bme", RTC_EXT_CTRL_4A, RTC_CTRL_4A_BME }, +#endif + { "abe", RTC_EXT_CTRL_4B, RTC_CTRL_4B_ABE }, + { "e32k", RTC_EXT_CTRL_4B, RTC_CTRL_4B_E32K }, + { "cs", RTC_EXT_CTRL_4B, RTC_CTRL_4B_CS }, + { "rce", RTC_EXT_CTRL_4B, RTC_CTRL_4B_RCE }, + { "prs", RTC_EXT_CTRL_4B, RTC_CTRL_4B_PRS }, + { "rie", RTC_EXT_CTRL_4B, RTC_CTRL_4B_RIE }, + { "wie", RTC_EXT_CTRL_4B, RTC_CTRL_4B_WIE }, + { "kse", RTC_EXT_CTRL_4B, RTC_CTRL_4B_KSE }, + { NULL, 0, 0 }, +}; + +/** + * ds1685_rtc_sysfs_ctrl_regs_lookup - ctrl register bit lookup function. + * @name: ctrl register bit to look up in ds1685_ctrl_regs_table. + */ +static const struct ds1685_rtc_ctrl_regs* +ds1685_rtc_sysfs_ctrl_regs_lookup(const char *name) +{ + const struct ds1685_rtc_ctrl_regs *p = ds1685_ctrl_regs_table; + + for (; p->name != NULL; ++p) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/** + * ds1685_rtc_sysfs_ctrl_regs_show - reads a ctrl register bit via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_ctrl_regs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 tmp; + struct ds1685_priv *rtc = dev_get_drvdata(dev); + const struct ds1685_rtc_ctrl_regs *reg_info = + ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name); + + /* Make sure we actually matched something. */ + if (!reg_info) + return -EINVAL; + + /* No spinlock during a read -- mutex is already held. */ + ds1685_rtc_switch_to_bank1(rtc); + tmp = rtc->read(rtc, reg_info->reg) & reg_info->bit; + ds1685_rtc_switch_to_bank0(rtc); + + return snprintf(buf, 2, "%d\n", (tmp ? 1 : 0)); +} + +/** + * ds1685_rtc_sysfs_ctrl_regs_store - writes a ctrl register bit via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + * @count: number of bytes written. + */ +static ssize_t +ds1685_rtc_sysfs_ctrl_regs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ds1685_priv *rtc = dev_get_drvdata(dev); + u8 reg = 0, bit = 0, tmp; + unsigned long flags = 0; + long int val = 0; + const struct ds1685_rtc_ctrl_regs *reg_info = + ds1685_rtc_sysfs_ctrl_regs_lookup(attr->attr.name); + + /* We only accept numbers. */ + if (kstrtol(buf, 10, &val) < 0) + return -EINVAL; + + /* bits are binary, 0 or 1 only. */ + if ((val != 0) && (val != 1)) + return -ERANGE; + + /* Make sure we actually matched something. */ + if (!reg_info) + return -EINVAL; + + reg = reg_info->reg; + bit = reg_info->bit; + + /* Safe to spinlock during a write. */ + ds1685_rtc_begin_ctrl_access(rtc, flags); + tmp = rtc->read(rtc, reg); + rtc->write(rtc, reg, (val ? (tmp | bit) : (tmp & ~(bit)))); + ds1685_rtc_end_ctrl_access(rtc, flags); + + return count; +} + +/** + * DS1685_RTC_SYSFS_CTRL_REG_RO - device_attribute for read-only register bit. + * @bit: bit to read. + */ +#define DS1685_RTC_SYSFS_CTRL_REG_RO(bit) \ + static DEVICE_ATTR(bit, S_IRUGO, \ + ds1685_rtc_sysfs_ctrl_regs_show, NULL) + +/** + * DS1685_RTC_SYSFS_CTRL_REG_RW - device_attribute for read-write register bit. + * @bit: bit to read or write. + */ +#define DS1685_RTC_SYSFS_CTRL_REG_RW(bit) \ + static DEVICE_ATTR(bit, S_IRUGO | S_IWUSR, \ + ds1685_rtc_sysfs_ctrl_regs_show, \ + ds1685_rtc_sysfs_ctrl_regs_store) + +/* + * Control Register A bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(uip); +DS1685_RTC_SYSFS_CTRL_REG_RW(dv2); +DS1685_RTC_SYSFS_CTRL_REG_RW(dv1); +DS1685_RTC_SYSFS_CTRL_REG_RO(dv0); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs3); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs2); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs1); +DS1685_RTC_SYSFS_CTRL_REG_RW(rs0); + +static struct attribute* +ds1685_rtc_sysfs_ctrla_attrs[] = { + &dev_attr_uip.attr, + &dev_attr_dv2.attr, + &dev_attr_dv1.attr, + &dev_attr_dv0.attr, + &dev_attr_rs3.attr, + &dev_attr_rs2.attr, + &dev_attr_rs1.attr, + &dev_attr_rs0.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrla_grp = { + .name = "ctrla", + .attrs = ds1685_rtc_sysfs_ctrla_attrs, +}; + + +/* + * Control Register B bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(set); +DS1685_RTC_SYSFS_CTRL_REG_RW(pie); +DS1685_RTC_SYSFS_CTRL_REG_RW(aie); +DS1685_RTC_SYSFS_CTRL_REG_RW(uie); +DS1685_RTC_SYSFS_CTRL_REG_RW(sqwe); +DS1685_RTC_SYSFS_CTRL_REG_RO(dm); +DS1685_RTC_SYSFS_CTRL_REG_RO(2412); +DS1685_RTC_SYSFS_CTRL_REG_RO(dse); + +static struct attribute* +ds1685_rtc_sysfs_ctrlb_attrs[] = { + &dev_attr_set.attr, + &dev_attr_pie.attr, + &dev_attr_aie.attr, + &dev_attr_uie.attr, + &dev_attr_sqwe.attr, + &dev_attr_dm.attr, + &dev_attr_2412.attr, + &dev_attr_dse.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrlb_grp = { + .name = "ctrlb", + .attrs = ds1685_rtc_sysfs_ctrlb_attrs, +}; + +/* + * Control Register C bits. + * + * Reading Control C clears these bits! Reading them individually can + * possibly cause an interrupt to be missed. Use the /proc interface + * to see all the bits in this register simultaneously. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(irqf); +DS1685_RTC_SYSFS_CTRL_REG_RO(pf); +DS1685_RTC_SYSFS_CTRL_REG_RO(af); +DS1685_RTC_SYSFS_CTRL_REG_RO(uf); + +static struct attribute* +ds1685_rtc_sysfs_ctrlc_attrs[] = { + &dev_attr_irqf.attr, + &dev_attr_pf.attr, + &dev_attr_af.attr, + &dev_attr_uf.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrlc_grp = { + .name = "ctrlc", + .attrs = ds1685_rtc_sysfs_ctrlc_attrs, +}; + +/* + * Control Register D bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(vrt); + +static struct attribute* +ds1685_rtc_sysfs_ctrld_attrs[] = { + &dev_attr_vrt.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrld_grp = { + .name = "ctrld", + .attrs = ds1685_rtc_sysfs_ctrld_attrs, +}; + +/* + * Control Register 4A bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RO(vrt2); +DS1685_RTC_SYSFS_CTRL_REG_RO(incr); +DS1685_RTC_SYSFS_CTRL_REG_RW(pab); +DS1685_RTC_SYSFS_CTRL_REG_RW(rf); +DS1685_RTC_SYSFS_CTRL_REG_RW(wf); +DS1685_RTC_SYSFS_CTRL_REG_RW(kf); +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) +DS1685_RTC_SYSFS_CTRL_REG_RO(bme); +#endif + +static struct attribute* +ds1685_rtc_sysfs_ctrl4a_attrs[] = { + &dev_attr_vrt2.attr, + &dev_attr_incr.attr, + &dev_attr_pab.attr, + &dev_attr_rf.attr, + &dev_attr_wf.attr, + &dev_attr_kf.attr, +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) + &dev_attr_bme.attr, +#endif + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrl4a_grp = { + .name = "ctrl4a", + .attrs = ds1685_rtc_sysfs_ctrl4a_attrs, +}; + +/* + * Control Register 4B bits. + */ +DS1685_RTC_SYSFS_CTRL_REG_RW(abe); +DS1685_RTC_SYSFS_CTRL_REG_RW(e32k); +DS1685_RTC_SYSFS_CTRL_REG_RO(cs); +DS1685_RTC_SYSFS_CTRL_REG_RW(rce); +DS1685_RTC_SYSFS_CTRL_REG_RW(prs); +DS1685_RTC_SYSFS_CTRL_REG_RW(rie); +DS1685_RTC_SYSFS_CTRL_REG_RW(wie); +DS1685_RTC_SYSFS_CTRL_REG_RW(kse); + +static struct attribute* +ds1685_rtc_sysfs_ctrl4b_attrs[] = { + &dev_attr_abe.attr, + &dev_attr_e32k.attr, + &dev_attr_cs.attr, + &dev_attr_rce.attr, + &dev_attr_prs.attr, + &dev_attr_rie.attr, + &dev_attr_wie.attr, + &dev_attr_kse.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_ctrl4b_grp = { + .name = "ctrl4b", + .attrs = ds1685_rtc_sysfs_ctrl4b_attrs, +}; + + +/** + * struct ds1685_rtc_ctrl_regs. + * @name: char pointer for the bit name. + * @reg: control register the bit is in. + * @bit: the bit's offset in the register. + */ +struct ds1685_rtc_time_regs { + const char *name; + const u8 reg; + const u8 mask; + const u8 min; + const u8 max; +}; + +/* + * Time/Date register lookup tables. + */ +static const struct ds1685_rtc_time_regs +ds1685_time_regs_bcd_table[] = { + { "seconds", RTC_SECS, RTC_SECS_BCD_MASK, 0, 59 }, + { "minutes", RTC_MINS, RTC_MINS_BCD_MASK, 0, 59 }, + { "hours", RTC_HRS, RTC_HRS_24_BCD_MASK, 0, 23 }, + { "wday", RTC_WDAY, RTC_WDAY_MASK, 1, 7 }, + { "mday", RTC_MDAY, RTC_MDAY_BCD_MASK, 1, 31 }, + { "month", RTC_MONTH, RTC_MONTH_BCD_MASK, 1, 12 }, + { "year", RTC_YEAR, RTC_YEAR_BCD_MASK, 0, 99 }, + { "century", RTC_CENTURY, RTC_CENTURY_MASK, 0, 99 }, + { "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BCD_MASK, 0, 59 }, + { "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BCD_MASK, 0, 59 }, + { "alarm_hours", RTC_HRS_ALARM, RTC_HRS_24_BCD_MASK, 0, 23 }, + { "alarm_mday", RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 1, 31 }, + { NULL, 0, 0, 0, 0 }, +}; + +static const struct ds1685_rtc_time_regs +ds1685_time_regs_bin_table[] = { + { "seconds", RTC_SECS, RTC_SECS_BIN_MASK, 0x00, 0x3b }, + { "minutes", RTC_MINS, RTC_MINS_BIN_MASK, 0x00, 0x3b }, + { "hours", RTC_HRS, RTC_HRS_24_BIN_MASK, 0x00, 0x17 }, + { "wday", RTC_WDAY, RTC_WDAY_MASK, 0x01, 0x07 }, + { "mday", RTC_MDAY, RTC_MDAY_BIN_MASK, 0x01, 0x1f }, + { "month", RTC_MONTH, RTC_MONTH_BIN_MASK, 0x01, 0x0c }, + { "year", RTC_YEAR, RTC_YEAR_BIN_MASK, 0x00, 0x63 }, + { "century", RTC_CENTURY, RTC_CENTURY_MASK, 0x00, 0x63 }, + { "alarm_seconds", RTC_SECS_ALARM, RTC_SECS_BIN_MASK, 0x00, 0x3b }, + { "alarm_minutes", RTC_MINS_ALARM, RTC_MINS_BIN_MASK, 0x00, 0x3b }, + { "alarm_hours", RTC_HRS_ALARM, RTC_HRS_24_BIN_MASK, 0x00, 0x17 }, + { "alarm_mday", RTC_MDAY_ALARM, RTC_MDAY_ALARM_MASK, 0x01, 0x1f }, + { NULL, 0, 0, 0x00, 0x00 }, +}; + +/** + * ds1685_rtc_sysfs_time_regs_bcd_lookup - time/date reg bit lookup function. + * @name: register bit to look up in ds1685_time_regs_bcd_table. + */ +static const struct ds1685_rtc_time_regs* +ds1685_rtc_sysfs_time_regs_lookup(const char *name, bool bcd_mode) +{ + const struct ds1685_rtc_time_regs *p; + + if (bcd_mode) + p = ds1685_time_regs_bcd_table; + else + p = ds1685_time_regs_bin_table; + + for (; p->name != NULL; ++p) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/** + * ds1685_rtc_sysfs_time_regs_show - reads a time/date register via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + */ +static ssize_t +ds1685_rtc_sysfs_time_regs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 tmp; + struct ds1685_priv *rtc = dev_get_drvdata(dev); + const struct ds1685_rtc_time_regs *bcd_reg_info = + ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true); + const struct ds1685_rtc_time_regs *bin_reg_info = + ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false); + + /* Make sure we actually matched something. */ + if (!bcd_reg_info && !bin_reg_info) + return -EINVAL; + + /* bcd_reg_info->reg == bin_reg_info->reg. */ + ds1685_rtc_begin_data_access(rtc); + tmp = rtc->read(rtc, bcd_reg_info->reg); + ds1685_rtc_end_data_access(rtc); + + tmp = ds1685_rtc_bcd2bin(rtc, tmp, bcd_reg_info->mask, + bin_reg_info->mask); + + return snprintf(buf, 4, "%d\n", tmp); +} + +/** + * ds1685_rtc_sysfs_time_regs_store - writes a time/date register via sysfs. + * @dev: pointer to device structure. + * @attr: pointer to device_attribute structure. + * @buf: pointer to char array to hold the output. + * @count: number of bytes written. + */ +static ssize_t +ds1685_rtc_sysfs_time_regs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + long int val = 0; + struct ds1685_priv *rtc = dev_get_drvdata(dev); + const struct ds1685_rtc_time_regs *bcd_reg_info = + ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, true); + const struct ds1685_rtc_time_regs *bin_reg_info = + ds1685_rtc_sysfs_time_regs_lookup(attr->attr.name, false); + + /* We only accept numbers. */ + if (kstrtol(buf, 10, &val) < 0) + return -EINVAL; + + /* Make sure we actually matched something. */ + if (!bcd_reg_info && !bin_reg_info) + return -EINVAL; + + /* Check for a valid range. */ + if (rtc->bcd_mode) { + if ((val < bcd_reg_info->min) || (val > bcd_reg_info->max)) + return -ERANGE; + } else { + if ((val < bin_reg_info->min) || (val > bin_reg_info->max)) + return -ERANGE; + } + + val = ds1685_rtc_bin2bcd(rtc, val, bin_reg_info->mask, + bcd_reg_info->mask); + + /* bcd_reg_info->reg == bin_reg_info->reg. */ + ds1685_rtc_begin_data_access(rtc); + rtc->write(rtc, bcd_reg_info->reg, val); + ds1685_rtc_end_data_access(rtc); + + return count; +} + +/** + * DS1685_RTC_SYSFS_REG_RW - device_attribute for a read-write time register. + * @reg: time/date register to read or write. + */ +#define DS1685_RTC_SYSFS_TIME_REG_RW(reg) \ + static DEVICE_ATTR(reg, S_IRUGO | S_IWUSR, \ + ds1685_rtc_sysfs_time_regs_show, \ + ds1685_rtc_sysfs_time_regs_store) + +/* + * Time/Date Register bits. + */ +DS1685_RTC_SYSFS_TIME_REG_RW(seconds); +DS1685_RTC_SYSFS_TIME_REG_RW(minutes); +DS1685_RTC_SYSFS_TIME_REG_RW(hours); +DS1685_RTC_SYSFS_TIME_REG_RW(wday); +DS1685_RTC_SYSFS_TIME_REG_RW(mday); +DS1685_RTC_SYSFS_TIME_REG_RW(month); +DS1685_RTC_SYSFS_TIME_REG_RW(year); +DS1685_RTC_SYSFS_TIME_REG_RW(century); +DS1685_RTC_SYSFS_TIME_REG_RW(alarm_seconds); +DS1685_RTC_SYSFS_TIME_REG_RW(alarm_minutes); +DS1685_RTC_SYSFS_TIME_REG_RW(alarm_hours); +DS1685_RTC_SYSFS_TIME_REG_RW(alarm_mday); + +static struct attribute* +ds1685_rtc_sysfs_time_attrs[] = { + &dev_attr_seconds.attr, + &dev_attr_minutes.attr, + &dev_attr_hours.attr, + &dev_attr_wday.attr, + &dev_attr_mday.attr, + &dev_attr_month.attr, + &dev_attr_year.attr, + &dev_attr_century.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_time_grp = { + .name = "datetime", + .attrs = ds1685_rtc_sysfs_time_attrs, +}; + +static struct attribute* +ds1685_rtc_sysfs_alarm_attrs[] = { + &dev_attr_alarm_seconds.attr, + &dev_attr_alarm_minutes.attr, + &dev_attr_alarm_hours.attr, + &dev_attr_alarm_mday.attr, + NULL, +}; + +static const struct attribute_group +ds1685_rtc_sysfs_alarm_grp = { + .name = "alarm", + .attrs = ds1685_rtc_sysfs_alarm_attrs, +}; +#endif /* CONFIG_RTC_DS1685_SYSFS_REGS */ + + +/** + * ds1685_rtc_sysfs_register - register sysfs files. + * @dev: pointer to device structure. + */ +static int +ds1685_rtc_sysfs_register(struct device *dev) +{ + int ret = 0; + + sysfs_bin_attr_init(&ds1685_rtc_sysfs_nvram_attr); + ret = sysfs_create_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp); + if (ret) + return ret; + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp); + if (ret) + return ret; + + ret = sysfs_create_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp); + if (ret) + return ret; +#endif + return 0; +} + +/** + * ds1685_rtc_sysfs_unregister - unregister sysfs files. + * @dev: pointer to device structure. + */ +static int +ds1685_rtc_sysfs_unregister(struct device *dev) +{ + sysfs_remove_bin_file(&dev->kobj, &ds1685_rtc_sysfs_nvram_attr); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_misc_grp); + +#ifdef CONFIG_RTC_DS1685_SYSFS_REGS + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrla_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlb_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrlc_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrld_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4a_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_ctrl4b_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_time_grp); + sysfs_remove_group(&dev->kobj, &ds1685_rtc_sysfs_alarm_grp); +#endif + + return 0; +} +#endif /* CONFIG_SYSFS */ + + + +/* ----------------------------------------------------------------------- */ +/* Driver Probe/Removal */ + +/** + * ds1685_rtc_probe - initializes rtc driver. + * @pdev: pointer to platform_device structure. + */ +static int +ds1685_rtc_probe(struct platform_device *pdev) +{ + struct rtc_device *rtc_dev; + struct resource *res; + struct ds1685_priv *rtc; + struct ds1685_rtc_platform_data *pdata; + u8 ctrla, ctrlb, hours; + unsigned char am_pm; + int ret = 0; + + /* Get the platform data. */ + pdata = (struct ds1685_rtc_platform_data *) pdev->dev.platform_data; + if (!pdata) + return -ENODEV; + + /* Allocate memory for the rtc device. */ + rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + /* + * Allocate/setup any IORESOURCE_MEM resources, if required. Not all + * platforms put the RTC in an easy-access place. Like the SGI Octane, + * which attaches the RTC to a "ByteBus", hooked to a SuperIO chip + * that sits behind the IOC3 PCI metadevice. + */ + if (pdata->alloc_io_resources) { + /* Get the platform resources. */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENXIO; + rtc->size = resource_size(res); + + /* Request a memory region. */ + /* XXX: mmio-only for now. */ + if (!devm_request_mem_region(&pdev->dev, res->start, rtc->size, + pdev->name)) + return -EBUSY; + + /* + * Set the base address for the rtc, and ioremap its + * registers. + */ + rtc->baseaddr = res->start; + rtc->regs = devm_ioremap(&pdev->dev, res->start, rtc->size); + if (!rtc->regs) + return -ENOMEM; + } + rtc->alloc_io_resources = pdata->alloc_io_resources; + + /* Get the register step size. */ + if (pdata->regstep > 0) + rtc->regstep = pdata->regstep; + else + rtc->regstep = 1; + + /* Platform read function, else default if mmio setup */ + if (pdata->plat_read) + rtc->read = pdata->plat_read; + else + if (pdata->alloc_io_resources) + rtc->read = ds1685_read; + else + return -ENXIO; + + /* Platform write function, else default if mmio setup */ + if (pdata->plat_write) + rtc->write = pdata->plat_write; + else + if (pdata->alloc_io_resources) + rtc->write = ds1685_write; + else + return -ENXIO; + + /* Platform pre-shutdown function, if defined. */ + if (pdata->plat_prepare_poweroff) + rtc->prepare_poweroff = pdata->plat_prepare_poweroff; + + /* Platform wake_alarm function, if defined. */ + if (pdata->plat_wake_alarm) + rtc->wake_alarm = pdata->plat_wake_alarm; + + /* Platform post_ram_clear function, if defined. */ + if (pdata->plat_post_ram_clear) + rtc->post_ram_clear = pdata->plat_post_ram_clear; + + /* Init the spinlock, workqueue, & set the driver data. */ + spin_lock_init(&rtc->lock); + INIT_WORK(&rtc->work, ds1685_rtc_work_queue); + platform_set_drvdata(pdev, rtc); + + /* Turn the oscillator on if is not already on (DV1 = 1). */ + ctrla = rtc->read(rtc, RTC_CTRL_A); + if (!(ctrla & RTC_CTRL_A_DV1)) + ctrla |= RTC_CTRL_A_DV1; + + /* Enable the countdown chain (DV2 = 0) */ + ctrla &= ~(RTC_CTRL_A_DV2); + + /* Clear RS3-RS0 in Control A. */ + ctrla &= ~(RTC_CTRL_A_RS_MASK); + + /* + * All done with Control A. Switch to Bank 1 for the remainder of + * the RTC setup so we have access to the extended functions. + */ + ctrla |= RTC_CTRL_A_DV0; + rtc->write(rtc, RTC_CTRL_A, ctrla); + + /* Default to 32768kHz output. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_E32K)); + + /* Set the SET bit in Control B so we can do some housekeeping. */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) | RTC_CTRL_B_SET)); + + /* Read Ext Ctrl 4A and check the INCR bit to avoid a lockout. */ + while (rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_INCR) + cpu_relax(); + + /* + * If the platform supports BCD mode, then set DM=0 in Control B. + * Otherwise, set DM=1 for BIN mode. + */ + ctrlb = rtc->read(rtc, RTC_CTRL_B); + if (pdata->bcd_mode) + ctrlb &= ~(RTC_CTRL_B_DM); + else + ctrlb |= RTC_CTRL_B_DM; + rtc->bcd_mode = pdata->bcd_mode; + + /* + * Disable Daylight Savings Time (DSE = 0). + * The RTC has hardcoded timezone information that is rendered + * obselete. We'll let the OS deal with DST settings instead. + */ + if (ctrlb & RTC_CTRL_B_DSE) + ctrlb &= ~(RTC_CTRL_B_DSE); + + /* Force 24-hour mode (2412 = 1). */ + if (!(ctrlb & RTC_CTRL_B_2412)) { + /* Reinitialize the time hours. */ + hours = rtc->read(rtc, RTC_HRS); + am_pm = hours & RTC_HRS_AMPM_MASK; + hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK, + RTC_HRS_12_BIN_MASK); + hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours)); + + /* Enable 24-hour mode. */ + ctrlb |= RTC_CTRL_B_2412; + + /* Write back to Control B, including DM & DSE bits. */ + rtc->write(rtc, RTC_CTRL_B, ctrlb); + + /* Write the time hours back. */ + rtc->write(rtc, RTC_HRS, + ds1685_rtc_bin2bcd(rtc, hours, + RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK)); + + /* Reinitialize the alarm hours. */ + hours = rtc->read(rtc, RTC_HRS_ALARM); + am_pm = hours & RTC_HRS_AMPM_MASK; + hours = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_12_BCD_MASK, + RTC_HRS_12_BIN_MASK); + hours = ((hours == 12) ? 0 : ((am_pm) ? hours + 12 : hours)); + + /* Write the alarm hours back. */ + rtc->write(rtc, RTC_HRS_ALARM, + ds1685_rtc_bin2bcd(rtc, hours, + RTC_HRS_24_BIN_MASK, + RTC_HRS_24_BCD_MASK)); + } else { + /* 24-hour mode is already set, so write Control B back. */ + rtc->write(rtc, RTC_CTRL_B, ctrlb); + } + + /* Unset the SET bit in Control B so the RTC can update. */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_SET))); + + /* Check the main battery. */ + if (!(rtc->read(rtc, RTC_CTRL_D) & RTC_CTRL_D_VRT)) + dev_warn(&pdev->dev, + "Main battery is exhausted! RTC may be invalid!\n"); + + /* Check the auxillary battery. It is optional. */ + if (!(rtc->read(rtc, RTC_EXT_CTRL_4A) & RTC_CTRL_4A_VRT2)) + dev_warn(&pdev->dev, + "Aux battery is exhausted or not available.\n"); + + /* Read Ctrl B and clear PIE/AIE/UIE. */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & ~(RTC_CTRL_B_PAU_MASK))); + + /* Reading Ctrl C auto-clears PF/AF/UF. */ + rtc->read(rtc, RTC_CTRL_C); + + /* Read Ctrl 4B and clear RIE/WIE/KSE. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) & ~(RTC_CTRL_4B_RWK_MASK))); + + /* Clear RF/WF/KF in Ctrl 4A. */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) & ~(RTC_CTRL_4A_RWK_MASK))); + + /* + * Re-enable KSE to handle power button events. We do not enable + * WIE or RIE by default. + */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) | RTC_CTRL_4B_KSE)); + + /* + * Fetch the IRQ and setup the interrupt handler. + * + * Not all platforms have the IRQF pin tied to something. If not, the + * RTC will still set the *IE / *F flags and raise IRQF in ctrlc, but + * there won't be an automatic way of notifying the kernel about it, + * unless ctrlc is explicitly polled. + */ + if (!pdata->no_irq) { + ret = platform_get_irq(pdev, 0); + if (ret > 0) { + rtc->irq_num = ret; + + /* Request an IRQ. */ + ret = devm_request_irq(&pdev->dev, rtc->irq_num, + ds1685_rtc_irq_handler, + IRQF_SHARED, pdev->name, pdev); + + /* Check to see if something came back. */ + if (unlikely(ret)) { + dev_warn(&pdev->dev, + "RTC interrupt not available\n"); + rtc->irq_num = 0; + } + } else + return ret; + } + rtc->no_irq = pdata->no_irq; + + /* Setup complete. */ + ds1685_rtc_switch_to_bank0(rtc); + + /* Register the device as an RTC. */ + rtc_dev = rtc_device_register(pdev->name, &pdev->dev, + &ds1685_rtc_ops, THIS_MODULE); + + /* Success? */ + if (IS_ERR(rtc_dev)) + return PTR_ERR(rtc_dev); + + /* Maximum periodic rate is 8192Hz (0.122070ms). */ + rtc_dev->max_user_freq = RTC_MAX_USER_FREQ; + + /* See if the platform doesn't support UIE. */ + if (pdata->uie_unsupported) + rtc_dev->uie_unsupported = 1; + rtc->uie_unsupported = pdata->uie_unsupported; + + rtc->dev = rtc_dev; + +#ifdef CONFIG_SYSFS + ret = ds1685_rtc_sysfs_register(&pdev->dev); + if (ret) + rtc_device_unregister(rtc->dev); +#endif + + /* Done! */ + return ret; +} + +/** + * ds1685_rtc_remove - removes rtc driver. + * @pdev: pointer to platform_device structure. + */ +static int +ds1685_rtc_remove(struct platform_device *pdev) +{ + struct ds1685_priv *rtc = platform_get_drvdata(pdev); + +#ifdef CONFIG_SYSFS + ds1685_rtc_sysfs_unregister(&pdev->dev); +#endif + + rtc_device_unregister(rtc->dev); + + /* Read Ctrl B and clear PIE/AIE/UIE. */ + rtc->write(rtc, RTC_CTRL_B, + (rtc->read(rtc, RTC_CTRL_B) & + ~(RTC_CTRL_B_PAU_MASK))); + + /* Reading Ctrl C auto-clears PF/AF/UF. */ + rtc->read(rtc, RTC_CTRL_C); + + /* Read Ctrl 4B and clear RIE/WIE/KSE. */ + rtc->write(rtc, RTC_EXT_CTRL_4B, + (rtc->read(rtc, RTC_EXT_CTRL_4B) & + ~(RTC_CTRL_4B_RWK_MASK))); + + /* Manually clear RF/WF/KF in Ctrl 4A. */ + rtc->write(rtc, RTC_EXT_CTRL_4A, + (rtc->read(rtc, RTC_EXT_CTRL_4A) & + ~(RTC_CTRL_4A_RWK_MASK))); + + cancel_work_sync(&rtc->work); + + return 0; +} + +/** + * ds1685_rtc_driver - rtc driver properties. + */ +static struct platform_driver ds1685_rtc_driver = { + .driver = { + .name = "rtc-ds1685", + .owner = THIS_MODULE, + }, + .probe = ds1685_rtc_probe, + .remove = ds1685_rtc_remove, +}; + +/** + * ds1685_rtc_init - rtc module init. + */ +static int __init +ds1685_rtc_init(void) +{ + return platform_driver_register(&ds1685_rtc_driver); +} + +/** + * ds1685_rtc_exit - rtc module exit. + */ +static void __exit +ds1685_rtc_exit(void) +{ + platform_driver_unregister(&ds1685_rtc_driver); +} + +module_init(ds1685_rtc_init); +module_exit(ds1685_rtc_exit); +/* ----------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------- */ +/* Poweroff function */ + +/** + * ds1685_rtc_poweroff - uses the RTC chip to power the system off. + * @pdev: pointer to platform_device structure. + */ +extern void __noreturn +ds1685_rtc_poweroff(struct platform_device *pdev) +{ + u8 ctrla, ctrl4a, ctrl4b; + struct ds1685_priv *rtc; + + /* Check for valid RTC data, else, spin forever. */ + if (unlikely(!pdev)) { + pr_emerg("rtc-ds1685: platform device data not available, spinning forever ...\n"); + unreachable(); + } else { + /* Get the rtc data. */ + rtc = platform_get_drvdata(pdev); + + /* + * Disable our IRQ. We're powering down, so we're not + * going to worry about cleaning up. Most of that should + * have been taken care of by the shutdown scripts and this + * is the final function call. + */ + if (!rtc->no_irq) + disable_irq_nosync(rtc->irq_num); + + /* Oscillator must be on and the countdown chain enabled. */ + ctrla = rtc->read(rtc, RTC_CTRL_A); + ctrla |= RTC_CTRL_A_DV1; + ctrla &= ~(RTC_CTRL_A_DV2); + rtc->write(rtc, RTC_CTRL_A, ctrla); + + /* + * Read Control 4A and check the status of the auxillary + * battery. This must be present and working (VRT2 = 1) + * for wakeup and kickstart functionality to be useful. + */ + ds1685_rtc_switch_to_bank1(rtc); + ctrl4a = rtc->read(rtc, RTC_EXT_CTRL_4A); + if (ctrl4a & RTC_CTRL_4A_VRT2) { + /* Clear all of the interrupt flags on Control 4A. */ + ctrl4a &= ~(RTC_CTRL_4A_RWK_MASK); + rtc->write(rtc, RTC_EXT_CTRL_4A, ctrl4a); + + /* + * The auxillary battery is present and working. + * Enable extended functions (ABE=1), enable + * wake-up (WIE=1), and enable kickstart (KSE=1) + * in Control 4B. + */ + ctrl4b = rtc->read(rtc, RTC_EXT_CTRL_4B); + ctrl4b |= (RTC_CTRL_4B_ABE | RTC_CTRL_4B_WIE | + RTC_CTRL_4B_KSE); + rtc->write(rtc, RTC_EXT_CTRL_4B, ctrl4b); + } + + /* Set PAB to 1 in Control 4A to power the system down. */ + dev_warn(&pdev->dev, "Powerdown.\n"); + msleep(20); + rtc->write(rtc, RTC_EXT_CTRL_4A, + (ctrl4a | RTC_CTRL_4A_PAB)); + + /* Spin ... we do not switch back to bank0. */ + unreachable(); + } +} +EXPORT_SYMBOL(ds1685_rtc_poweroff); +/* ----------------------------------------------------------------------- */ + + +MODULE_AUTHOR("Joshua Kinard "); +MODULE_AUTHOR("Matthias Fuchs "); +MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_ALIAS("platform:rtc-ds1685"); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h new file mode 100644 index 000000000000..e6337a56d741 --- /dev/null +++ b/include/linux/rtc/ds1685.h @@ -0,0 +1,375 @@ +/* + * Definitions for the registers, addresses, and platform data of the + * DS1685/DS1687-series RTC chips. + * + * This Driver also works for the DS17X85/DS17X87 RTC chips. Functionally + * similar to the DS1685/DS1687, they support a few extra features which + * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC + * write counter. + * + * Copyright (C) 2011-2014 Joshua Kinard . + * Copyright (C) 2009 Matthias Fuchs . + * + * References: + * DS1685/DS1687 3V/5V Real-Time Clocks, 19-5215, Rev 4/10. + * DS17x85/DS17x87 3V/5V Real-Time Clocks, 19-5222, Rev 4/10. + * DS1689/DS1693 3V/5V Serialized Real-Time Clocks, Rev 112105. + * Application Note 90, Using the Multiplex Bus RTC Extended Features. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _LINUX_RTC_DS1685_H_ +#define _LINUX_RTC_DS1685_H_ + +#include +#include +#include + +/** + * struct ds1685_priv - DS1685 private data structure. + * @dev: pointer to the rtc_device structure. + * @regs: iomapped base address pointer of the RTC registers. + * @regstep: padding/step size between registers (optional). + * @baseaddr: base address of the RTC device. + * @size: resource size. + * @lock: private lock variable for spin locking/unlocking. + * @work: private workqueue. + * @irq: IRQ number assigned to the RTC device. + * @prepare_poweroff: pointer to platform pre-poweroff function. + * @wake_alarm: pointer to platform wake alarm function. + * @post_ram_clear: pointer to platform post ram-clear function. + */ +struct ds1685_priv { + struct rtc_device *dev; + void __iomem *regs; + u32 regstep; + resource_size_t baseaddr; + size_t size; + spinlock_t lock; + struct work_struct work; + int irq_num; + bool bcd_mode; + bool no_irq; + bool uie_unsupported; + bool alloc_io_resources; + u8 (*read)(struct ds1685_priv *, int); + void (*write)(struct ds1685_priv *, int, u8); + void (*prepare_poweroff)(void); + void (*wake_alarm)(void); + void (*post_ram_clear)(void); +}; + + +/** + * struct ds1685_rtc_platform_data - platform data structure. + * @plat_prepare_poweroff: platform-specific pre-poweroff function. + * @plat_wake_alarm: platform-specific wake alarm function. + * @plat_post_ram_clear: platform-specific post ram-clear function. + * + * If your platform needs to use a custom padding/step size between + * registers, or uses one or more of the extended interrupts and needs special + * handling, then include this header file in your platform definition and + * set regstep and the plat_* pointers as appropriate. + */ +struct ds1685_rtc_platform_data { + const u32 regstep; + const bool bcd_mode; + const bool no_irq; + const bool uie_unsupported; + const bool alloc_io_resources; + u8 (*plat_read)(struct ds1685_priv *, int); + void (*plat_write)(struct ds1685_priv *, int, u8); + void (*plat_prepare_poweroff)(void); + void (*plat_wake_alarm)(void); + void (*plat_post_ram_clear)(void); +}; + + +/* + * Time Registers. + */ +#define RTC_SECS 0x00 /* Seconds 00-59 */ +#define RTC_SECS_ALARM 0x01 /* Alarm Seconds 00-59 */ +#define RTC_MINS 0x02 /* Minutes 00-59 */ +#define RTC_MINS_ALARM 0x03 /* Alarm Minutes 00-59 */ +#define RTC_HRS 0x04 /* Hours 01-12 AM/PM || 00-23 */ +#define RTC_HRS_ALARM 0x05 /* Alarm Hours 01-12 AM/PM || 00-23 */ +#define RTC_WDAY 0x06 /* Day of Week 01-07 */ +#define RTC_MDAY 0x07 /* Day of Month 01-31 */ +#define RTC_MONTH 0x08 /* Month 01-12 */ +#define RTC_YEAR 0x09 /* Year 00-99 */ +#define RTC_CENTURY 0x48 /* Century 00-99 */ +#define RTC_MDAY_ALARM 0x49 /* Alarm Day of Month 01-31 */ + + +/* + * Bit masks for the Time registers in BCD Mode (DM = 0). + */ +#define RTC_SECS_BCD_MASK 0x7f /* - x x x x x x x */ +#define RTC_MINS_BCD_MASK 0x7f /* - x x x x x x x */ +#define RTC_HRS_12_BCD_MASK 0x1f /* - - - x x x x x */ +#define RTC_HRS_24_BCD_MASK 0x3f /* - - x x x x x x */ +#define RTC_MDAY_BCD_MASK 0x3f /* - - x x x x x x */ +#define RTC_MONTH_BCD_MASK 0x1f /* - - - x x x x x */ +#define RTC_YEAR_BCD_MASK 0xff /* x x x x x x x x */ + +/* + * Bit masks for the Time registers in BIN Mode (DM = 1). + */ +#define RTC_SECS_BIN_MASK 0x3f /* - - x x x x x x */ +#define RTC_MINS_BIN_MASK 0x3f /* - - x x x x x x */ +#define RTC_HRS_12_BIN_MASK 0x0f /* - - - - x x x x */ +#define RTC_HRS_24_BIN_MASK 0x1f /* - - - x x x x x */ +#define RTC_MDAY_BIN_MASK 0x1f /* - - - x x x x x */ +#define RTC_MONTH_BIN_MASK 0x0f /* - - - - x x x x */ +#define RTC_YEAR_BIN_MASK 0x7f /* - x x x x x x x */ + +/* + * Bit masks common for the Time registers in BCD or BIN Mode. + */ +#define RTC_WDAY_MASK 0x07 /* - - - - - x x x */ +#define RTC_CENTURY_MASK 0xff /* x x x x x x x x */ +#define RTC_MDAY_ALARM_MASK 0xff /* x x x x x x x x */ +#define RTC_HRS_AMPM_MASK BIT(7) /* Mask for the AM/PM bit */ + + + +/* + * Control Registers. + */ +#define RTC_CTRL_A 0x0a /* Control Register A */ +#define RTC_CTRL_B 0x0b /* Control Register B */ +#define RTC_CTRL_C 0x0c /* Control Register C */ +#define RTC_CTRL_D 0x0d /* Control Register D */ +#define RTC_EXT_CTRL_4A 0x4a /* Extended Control Register 4A */ +#define RTC_EXT_CTRL_4B 0x4b /* Extended Control Register 4B */ + + +/* + * Bit names in Control Register A. + */ +#define RTC_CTRL_A_UIP BIT(7) /* Update In Progress */ +#define RTC_CTRL_A_DV2 BIT(6) /* Countdown Chain */ +#define RTC_CTRL_A_DV1 BIT(5) /* Oscillator Enable */ +#define RTC_CTRL_A_DV0 BIT(4) /* Bank Select */ +#define RTC_CTRL_A_RS2 BIT(2) /* Rate-Selection Bit 2 */ +#define RTC_CTRL_A_RS3 BIT(3) /* Rate-Selection Bit 3 */ +#define RTC_CTRL_A_RS1 BIT(1) /* Rate-Selection Bit 1 */ +#define RTC_CTRL_A_RS0 BIT(0) /* Rate-Selection Bit 0 */ +#define RTC_CTRL_A_RS_MASK 0x0f /* RS3 + RS2 + RS1 + RS0 */ + +/* + * Bit names in Control Register B. + */ +#define RTC_CTRL_B_SET BIT(7) /* SET Bit */ +#define RTC_CTRL_B_PIE BIT(6) /* Periodic-Interrupt Enable */ +#define RTC_CTRL_B_AIE BIT(5) /* Alarm-Interrupt Enable */ +#define RTC_CTRL_B_UIE BIT(4) /* Update-Ended Interrupt-Enable */ +#define RTC_CTRL_B_SQWE BIT(3) /* Square-Wave Enable */ +#define RTC_CTRL_B_DM BIT(2) /* Data Mode */ +#define RTC_CTRL_B_2412 BIT(1) /* 12-Hr/24-Hr Mode */ +#define RTC_CTRL_B_DSE BIT(0) /* Daylight Savings Enable */ +#define RTC_CTRL_B_PAU_MASK 0x70 /* PIE + AIE + UIE */ + + +/* + * Bit names in Control Register C. + * + * BIT(0), BIT(1), BIT(2), & BIT(3) are unused, always return 0, and cannot + * be written to. + */ +#define RTC_CTRL_C_IRQF BIT(7) /* Interrupt-Request Flag */ +#define RTC_CTRL_C_PF BIT(6) /* Periodic-Interrupt Flag */ +#define RTC_CTRL_C_AF BIT(5) /* Alarm-Interrupt Flag */ +#define RTC_CTRL_C_UF BIT(4) /* Update-Ended Interrupt Flag */ +#define RTC_CTRL_C_PAU_MASK 0x70 /* PF + AF + UF */ + + +/* + * Bit names in Control Register D. + * + * BIT(0) through BIT(6) are unused, always return 0, and cannot + * be written to. + */ +#define RTC_CTRL_D_VRT BIT(7) /* Valid RAM and Time */ + + +/* + * Bit names in Extended Control Register 4A. + * + * On the DS1685/DS1687/DS1689/DS1693, BIT(4) and BIT(5) are reserved for + * future use. They can be read from and written to, but have no effect + * on the RTC's operation. + * + * On the DS17x85/DS17x87, BIT(5) is Burst-Mode Enable (BME), and allows + * access to the extended NV-SRAM by automatically incrementing the address + * register when they are read from or written to. + */ +#define RTC_CTRL_4A_VRT2 BIT(7) /* Auxillary Battery Status */ +#define RTC_CTRL_4A_INCR BIT(6) /* Increment-in-Progress Status */ +#define RTC_CTRL_4A_PAB BIT(3) /* Power-Active Bar Control */ +#define RTC_CTRL_4A_RF BIT(2) /* RAM-Clear Flag */ +#define RTC_CTRL_4A_WF BIT(1) /* Wake-Up Alarm Flag */ +#define RTC_CTRL_4A_KF BIT(0) /* Kickstart Flag */ +#if !defined(CONFIG_RTC_DRV_DS1685) && !defined(CONFIG_RTC_DRV_DS1689) +#define RTC_CTRL_4A_BME BIT(5) /* Burst-Mode Enable */ +#endif +#define RTC_CTRL_4A_RWK_MASK 0x07 /* RF + WF + KF */ + + +/* + * Bit names in Extended Control Register 4B. + */ +#define RTC_CTRL_4B_ABE BIT(7) /* Auxillary Battery Enable */ +#define RTC_CTRL_4B_E32K BIT(6) /* Enable 32.768Hz on SQW Pin */ +#define RTC_CTRL_4B_CS BIT(5) /* Crystal Select */ +#define RTC_CTRL_4B_RCE BIT(4) /* RAM Clear-Enable */ +#define RTC_CTRL_4B_PRS BIT(3) /* PAB Reset-Select */ +#define RTC_CTRL_4B_RIE BIT(2) /* RAM Clear-Interrupt Enable */ +#define RTC_CTRL_4B_WIE BIT(1) /* Wake-Up Alarm-Interrupt Enable */ +#define RTC_CTRL_4B_KSE BIT(0) /* Kickstart Interrupt-Enable */ +#define RTC_CTRL_4B_RWK_MASK 0x07 /* RIE + WIE + KSE */ + + +/* + * Misc register names in Bank 1. + * + * The DV0 bit in Control Register A must be set to 1 for these registers + * to become available, including Extended Control Registers 4A & 4B. + */ +#define RTC_BANK1_SSN_MODEL 0x40 /* Model Number */ +#define RTC_BANK1_SSN_BYTE_1 0x41 /* 1st Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_2 0x42 /* 2nd Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_3 0x43 /* 3rd Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_4 0x44 /* 4th Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_5 0x45 /* 5th Byte of Serial Number */ +#define RTC_BANK1_SSN_BYTE_6 0x46 /* 6th Byte of Serial Number */ +#define RTC_BANK1_SSN_CRC 0x47 /* Serial CRC Byte */ +#define RTC_BANK1_RAM_DATA_PORT 0x53 /* Extended RAM Data Port */ + + +/* + * Model-specific registers in Bank 1. + * + * The addresses below differ depending on the model of the RTC chip + * selected in the kernel configuration. Not all of these features are + * supported in the main driver at present. + * + * DS1685/DS1687 - Extended NV-SRAM address (LSB only). + * DS1689/DS1693 - Vcc, Vbat, Pwr Cycle Counters & Customer-specific S/N. + * DS17x85/DS17x87 - Extended NV-SRAM addresses (MSB & LSB) & Write counter. + */ +#if defined(CONFIG_RTC_DRV_DS1685) +#define RTC_BANK1_RAM_ADDR 0x50 /* NV-SRAM Addr */ +#elif defined(CONFIG_RTC_DRV_DS1689) +#define RTC_BANK1_VCC_CTR_LSB 0x54 /* Vcc Counter Addr (LSB) */ +#define RTC_BANK1_VCC_CTR_MSB 0x57 /* Vcc Counter Addr (MSB) */ +#define RTC_BANK1_VBAT_CTR_LSB 0x58 /* Vbat Counter Addr (LSB) */ +#define RTC_BANK1_VBAT_CTR_MSB 0x5b /* Vbat Counter Addr (MSB) */ +#define RTC_BANK1_PWR_CTR_LSB 0x5c /* Pwr Cycle Counter Addr (LSB) */ +#define RTC_BANK1_PWR_CTR_MSB 0x5d /* Pwr Cycle Counter Addr (MSB) */ +#define RTC_BANK1_UNIQ_SN 0x60 /* Customer-specific S/N */ +#else /* DS17x85/DS17x87 */ +#define RTC_BANK1_RAM_ADDR_LSB 0x50 /* NV-SRAM Addr (LSB) */ +#define RTC_BANK1_RAM_ADDR_MSB 0x51 /* NV-SRAM Addr (MSB) */ +#define RTC_BANK1_WRITE_CTR 0x5e /* RTC Write Counter */ +#endif + + +/* + * Model numbers. + * + * The DS1688/DS1691 and DS1689/DS1693 chips share the same model number + * and the manual doesn't indicate any major differences. As such, they + * are regarded as the same chip in this driver. + */ +#define RTC_MODEL_DS1685 0x71 /* DS1685/DS1687 */ +#define RTC_MODEL_DS17285 0x72 /* DS17285/DS17287 */ +#define RTC_MODEL_DS1689 0x73 /* DS1688/DS1691/DS1689/DS1693 */ +#define RTC_MODEL_DS17485 0x74 /* DS17485/DS17487 */ +#define RTC_MODEL_DS17885 0x78 /* DS17885/DS17887 */ + + +/* + * Periodic Interrupt Rates / Square-Wave Output Frequency + * + * Periodic rates are selected by setting the RS3-RS0 bits in Control + * Register A and enabled via either the E32K bit in Extended Control + * Register 4B or the SQWE bit in Control Register B. + * + * E32K overrides the settings of RS3-RS0 and outputs a frequency of 32768Hz + * on the SQW pin of the RTC chip. While there are 16 possible selections, + * the 1-of-16 decoder is only able to divide the base 32768Hz signal into 13 + * smaller frequencies. The values 0x01 and 0x02 are not used and are + * synonymous with 0x08 and 0x09, respectively. + * + * When E32K is set to a logic 1, periodic interrupts are disabled and reading + * /dev/rtc will return -EINVAL. This also applies if the periodic interrupt + * frequency is set to 0Hz. + * + * Not currently used by the rtc-ds1685 driver because the RTC core removed + * support for hardware-generated periodic-interrupts in favour of + * hrtimer-generated interrupts. But these defines are kept around for use + * in userland, as documentation to the hardware, and possible future use if + * hardware-generated periodic interrupts are ever added back. + */ + /* E32K RS3 RS2 RS1 RS0 */ +#define RTC_SQW_8192HZ 0x03 /* 0 0 0 1 1 */ +#define RTC_SQW_4096HZ 0x04 /* 0 0 1 0 0 */ +#define RTC_SQW_2048HZ 0x05 /* 0 0 1 0 1 */ +#define RTC_SQW_1024HZ 0x06 /* 0 0 1 1 0 */ +#define RTC_SQW_512HZ 0x07 /* 0 0 1 1 1 */ +#define RTC_SQW_256HZ 0x08 /* 0 1 0 0 0 */ +#define RTC_SQW_128HZ 0x09 /* 0 1 0 0 1 */ +#define RTC_SQW_64HZ 0x0a /* 0 1 0 1 0 */ +#define RTC_SQW_32HZ 0x0b /* 0 1 0 1 1 */ +#define RTC_SQW_16HZ 0x0c /* 0 1 1 0 0 */ +#define RTC_SQW_8HZ 0x0d /* 0 1 1 0 1 */ +#define RTC_SQW_4HZ 0x0e /* 0 1 1 1 0 */ +#define RTC_SQW_2HZ 0x0f /* 0 1 1 1 1 */ +#define RTC_SQW_0HZ 0x00 /* 0 0 0 0 0 */ +#define RTC_SQW_32768HZ 32768 /* 1 - - - - */ +#define RTC_MAX_USER_FREQ 8192 + + +/* + * NVRAM data & addresses: + * - 50 bytes of NVRAM are available just past the clock registers. + * - 64 additional bytes are available in Bank0. + * + * Extended, battery-backed NV-SRAM: + * - DS1685/DS1687 - 128 bytes. + * - DS1689/DS1693 - 0 bytes. + * - DS17285/DS17287 - 2048 bytes. + * - DS17485/DS17487 - 4096 bytes. + * - DS17885/DS17887 - 8192 bytes. + */ +#define NVRAM_TIME_BASE 0x0e /* NVRAM Addr in Time regs */ +#define NVRAM_BANK0_BASE 0x40 /* NVRAM Addr in Bank0 regs */ +#define NVRAM_SZ_TIME 50 +#define NVRAM_SZ_BANK0 64 +#if defined(CONFIG_RTC_DRV_DS1685) +# define NVRAM_SZ_EXTND 128 +#elif defined(CONFIG_RTC_DRV_DS1689) +# define NVRAM_SZ_EXTND 0 +#elif defined(CONFIG_RTC_DRV_DS17285) +# define NVRAM_SZ_EXTND 2048 +#elif defined(CONFIG_RTC_DRV_DS17485) +# define NVRAM_SZ_EXTND 4096 +#elif defined(CONFIG_RTC_DRV_DS17885) +# define NVRAM_SZ_EXTND 8192 +#endif +#define NVRAM_TOTAL_SZ_BANK0 (NVRAM_SZ_TIME + NVRAM_SZ_BANK0) +#define NVRAM_TOTAL_SZ (NVRAM_TOTAL_SZ_BANK0 + NVRAM_SZ_EXTND) + + +/* + * Function Prototypes. + */ +extern void __noreturn +ds1685_rtc_poweroff(struct platform_device *pdev); + +#endif /* _LINUX_RTC_DS1685_H_ */