remarkable-linux/fs/xfs/xfs_dinode.h
Dave Chinner e1b4271ac2 xfs: di_flushiter considered harmful
When we made all inode updates transactional, we no longer needed
the log recovery detection for inodes being newer on disk than the
transaction being replayed - it was redundant as replay of the log
would always result in the latest version of the inode would be on
disk. It was redundant, but left in place because it wasn't
considered to be a problem.

However, with the new "don't read inodes on create" optimisation,
flushiter has come back to bite us. Essentially, the optimisation
made always initialises flushiter to zero in the create transaction,
and so if we then crash and run recovery and the inode already on
disk has a non-zero flushiter it will skip recovery of that inode.
As a result, log recovery does the wrong thing and we end up with a
corrupt filesystem.

Because we have to support old kernel to new kernel upgrades, we
can't just get rid of the flushiter support in log recovery as we
might be upgrading from a kernel that doesn't have fully transactional
inode updates.  Unfortunately, for v4 superblocks there is no way to
guarantee that log recovery knows about this fact.

We cannot add a new inode format flag to say it's a "special inode
create" because it won't be understood by older kernels and so
recovery could do the wrong thing on downgrade. We cannot specially
detect the combination of zero mode/non-zero flushiter on disk to
non-zero mode, zero flushiter in the log item during recovery
because wrapping of the flushiter can result in false detection.

Hence that makes this "don't use flushiter" optimisation limited to
a disk format that guarantees that we don't need it. And that means
the only fix here is to limit the "no read IO on create"
optimisation to version 5 superblocks....

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>

(cherry picked from commit e60896d8f2)
2013-07-25 10:41:42 -05:00

242 lines
9 KiB
C

/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_DINODE_H__
#define __XFS_DINODE_H__
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
typedef struct xfs_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
} xfs_timestamp_t;
/*
* On-disk inode structure.
*
* This is just the header or "dinode core", the inode is expanded to fill a
* variable size the leftover area split into a data and an attribute fork.
* The format of the data and attribute fork depends on the format of the
* inode as indicated by di_format and di_aformat. To access the data and
* attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
* below.
*
* There is a very similar struct icdinode in xfs_inode which matches the
* layout of the first 96 bytes of this structure, but is kept in native
* format instead of big endian.
*
* Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
* padding field for v3 inodes.
*/
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
__u8 di_format; /* format of di_c data */
__be16 di_onlink; /* old number of links to file */
__be32 di_uid; /* owner's user id */
__be32 di_gid; /* owner's group id */
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
__u8 di_pad[6]; /* unused, zeroed space */
__be16 di_flushiter; /* incremented on flush */
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
__be32 di_nextents; /* number of extents in data fork */
__be16 di_anextents; /* number of extents in attribute fork*/
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
__be16 di_dmstate; /* DMIG state info */
__be16 di_flags; /* random flags, XFS_DIFLAG_... */
__be32 di_gen; /* generation number */
/* di_next_unlinked is the only non-core field in the old dinode */
__be32 di_next_unlinked;/* agi unlinked list ptr */
/* start of the extended dinode, writable fields */
__le32 di_crc; /* CRC of the inode */
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
__u8 di_pad2[16]; /* more padding for future expansion */
/* fields only written to during inode creation */
xfs_timestamp_t di_crtime; /* time created */
__be64 di_ino; /* inode number */
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
} xfs_dinode_t;
#define DI_MAX_FLUSH 0xffff
/*
* Size of the core inode on disk. Version 1 and 2 inodes have
* the same size, but version 3 has grown a few additional fields.
*/
static inline uint xfs_dinode_size(int version)
{
if (version == 3)
return sizeof(struct xfs_dinode);
return offsetof(struct xfs_dinode, di_crc);
}
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
* The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
*/
#define XFS_MAXLINK ((1U << 31) - 1U)
#define XFS_MAXLINK_1 65535U
/*
* Values for di_format
*/
typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_DEV, /* xfs_dev_t */
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
XFS_DINODE_FMT_UUID /* uuid_t */
} xfs_dinode_fmt_t;
/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
#define XFS_DINODE_MAX_LOG 11
#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
/*
* Inode size for given fs.
*/
#define XFS_LITINO(mp, version) \
((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
/*
* Inode data & attribute fork sizes, per inode.
*/
#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
XFS_DFORK_BOFF(dip) : \
XFS_LITINO(mp, (dip)->di_version))
#define XFS_DFORK_ASIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
XFS_DFORK_DSIZE(dip, mp) : \
XFS_DFORK_ASIZE(dip, mp))
/*
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
((char *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
#define XFS_DFORK_FORMAT(dip,w) \
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
#define XFS_DFORK_NEXTENTS(dip,w) \
((w) == XFS_DATA_FORK ? \
be32_to_cpu((dip)->di_nextents) : \
be16_to_cpu((dip)->di_anextents))
#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
/*
* For block and character special files the 32bit dev_t is stored at the
* beginning of the data fork.
*/
static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
{
return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
}
static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
{
*(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
}
/*
* Values for di_flags
* There should be a one-to-one correspondence between these flags and the
* XFS_XFLAG_s.
*/
#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
#ifdef CONFIG_XFS_RT
#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
#else
#define XFS_IS_REALTIME_INODE(ip) (0)
#endif
#define XFS_DIFLAG_ANY \
(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
#endif /* __XFS_DINODE_H__ */