From 7cc718d56c8297bd3a3c106ca09e8abf9814bb27 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 10 Feb 2008 11:21:54 +0100 Subject: [PATCH 01/51] slab: avoid double initialization & do initialization in 1 place - alloc_slabmgmt: initialize all slab fields in 1 place - slab->nodeid was initialized twice: in alloc_slabmgmt and immediately after it in cache_grow Signed-off-by: Marcin Slusarz CC: Christoph Lameter Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 40c00dacbe4b..473e6c2eaefb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2630,6 +2630,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; + slabp->free = 0; return slabp; } @@ -2683,7 +2684,6 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_bufctl(slabp)[i] = i + 1; } slab_bufctl(slabp)[i - 1] = BUFCTL_END; - slabp->free = 0; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2816,7 +2816,6 @@ static int cache_grow(struct kmem_cache *cachep, if (!slabp) goto opps1; - slabp->nodeid = nodeid; slab_map_pages(cachep, slabp, objp); cache_init_objs(cachep, slabp); From 4187377b2411d43ea4470b35162917a5093857bf Mon Sep 17 00:00:00 2001 From: Eric Dujardin Date: Sat, 23 Feb 2008 22:51:28 -0700 Subject: [PATCH 02/51] [POWERPC] Add export for mpc52xx_set_psc_clkdiv mpc52xx_set_psc_clkdiv is needed by PSC device drivers. Signed-off-by: Eric Dujardin Signed-off-by: Grant Likely --- arch/powerpc/platforms/52xx/mpc52xx_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_common.c b/arch/powerpc/platforms/52xx/mpc52xx_common.c index 9aa4425d80b2..4d5fd1dbd400 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_common.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_common.c @@ -199,6 +199,7 @@ int mpc52xx_set_psc_clkdiv(int psc_id, int clkdiv) return 0; } +EXPORT_SYMBOL(mpc52xx_set_psc_clkdiv); /** * mpc52xx_restart: ppc_md->restart hook for mpc5200 using the watchdog timer From d58831375d68a3bd39d5ebab9eca711fbb4ee108 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 26 Feb 2008 13:31:42 +1100 Subject: [PATCH 03/51] [POWERPC] spufs: fix context destruction during psmap fault We have a small window where a spu context may be destroyed while we're servicing a page fault (from another thread) to the context's problem state mapping. After we up_read() the mmap_sem, it's possible that the context is destroyed by its owning thread, and so the later references to ctx are invalid. This can maifest as a deadlock on the (now free()-ed) context state mutex. This change adds a reference to the context before we release the mmap_sem, so that the context cannot be destroyed. Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index c66c3756970d..f7a7e8635fb6 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -366,6 +366,13 @@ static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma, if (offset >= ps_size) return NOPFN_SIGBUS; + /* + * Because we release the mmap_sem, the context may be destroyed while + * we're in spu_wait. Grab an extra reference so it isn't destroyed + * in the meantime. + */ + get_spu_context(ctx); + /* * We have to wait for context to be loaded before we have * pages to hand out to the user, but we don't want to wait @@ -375,7 +382,7 @@ static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma, * hanged. */ if (spu_acquire(ctx)) - return NOPFN_REFAULT; + goto refault; if (ctx->state == SPU_STATE_SAVED) { up_read(¤t->mm->mmap_sem); @@ -391,6 +398,9 @@ static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma, if (!ret) spu_release(ctx); + +refault: + put_spu_context(ctx); return NOPFN_REFAULT; } From e33eb074cb95783b4497327098e4128e9f8c15b9 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 20 Feb 2008 21:45:58 +1100 Subject: [PATCH 04/51] [POWERPC] 4xx: Fix Haleakala PCIe compatibility problem in dts Since the 4xx PCIe driver checks for 405ex compatibility, the PCIe interface was not detected as it is currently defined as "405exr" compatible. This patch changes it to "405ex". The 405EX and 405EXr are identical exept that the 2nd PCIe and the 2nd EMAC interfaces are missing. Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer --- arch/powerpc/boot/dts/haleakala.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/haleakala.dts b/arch/powerpc/boot/dts/haleakala.dts index 5dd3d15f0feb..ae68fefc01b6 100644 --- a/arch/powerpc/boot/dts/haleakala.dts +++ b/arch/powerpc/boot/dts/haleakala.dts @@ -235,7 +235,7 @@ #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; - compatible = "ibm,plb-pciex-405exr", "ibm,plb-pciex"; + compatible = "ibm,plb-pciex-405ex", "ibm,plb-pciex"; primary; port = <0>; /* port number */ reg = Date: Fri, 22 Feb 2008 02:21:37 +1100 Subject: [PATCH 05/51] [POWERPC] 4xx: Fix L1 cache size in katmai DTS This patch changes the katmai (440SPe) L1 cache size to 32k. Some whitespace issues are cleaned up too. Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer --- arch/powerpc/boot/dts/katmai.dts | 58 ++++++++++++++++---------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/boot/dts/katmai.dts b/arch/powerpc/boot/dts/katmai.dts index bc32ac7250ec..fc86e5a3afc4 100644 --- a/arch/powerpc/boot/dts/katmai.dts +++ b/arch/powerpc/boot/dts/katmai.dts @@ -38,8 +38,8 @@ timebase-frequency = <0>; /* Filled in by zImage */ i-cache-line-size = <20>; d-cache-line-size = <20>; - i-cache-size = <20000>; - d-cache-size = <20000>; + i-cache-size = <8000>; + d-cache-size = <8000>; dcr-controller; dcr-access-method = "native"; }; @@ -136,11 +136,11 @@ }; POB0: opb { - compatible = "ibm,opb-440spe", "ibm,opb-440gp", "ibm,opb"; + compatible = "ibm,opb-440spe", "ibm,opb-440gp", "ibm,opb"; #address-cells = <1>; #size-cells = <1>; - ranges = <00000000 4 e0000000 20000000>; - clock-frequency = <0>; /* Filled in by zImage */ + ranges = <00000000 4 e0000000 20000000>; + clock-frequency = <0>; /* Filled in by zImage */ EBC0: ebc { compatible = "ibm,ebc-440spe", "ibm,ebc-440gp", "ibm,ebc"; @@ -153,38 +153,38 @@ }; UART0: serial@10000200 { - device_type = "serial"; - compatible = "ns16550"; - reg = <10000200 8>; + device_type = "serial"; + compatible = "ns16550"; + reg = <10000200 8>; virtual-reg = ; - clock-frequency = <0>; /* Filled in by zImage */ - current-speed = <1c200>; - interrupt-parent = <&UIC0>; - interrupts = <0 4>; - }; + clock-frequency = <0>; /* Filled in by zImage */ + current-speed = <1c200>; + interrupt-parent = <&UIC0>; + interrupts = <0 4>; + }; UART1: serial@10000300 { - device_type = "serial"; - compatible = "ns16550"; - reg = <10000300 8>; + device_type = "serial"; + compatible = "ns16550"; + reg = <10000300 8>; virtual-reg = ; - clock-frequency = <0>; - current-speed = <0>; - interrupt-parent = <&UIC0>; - interrupts = <1 4>; - }; + clock-frequency = <0>; + current-speed = <0>; + interrupt-parent = <&UIC0>; + interrupts = <1 4>; + }; UART2: serial@10000600 { - device_type = "serial"; - compatible = "ns16550"; - reg = <10000600 8>; + device_type = "serial"; + compatible = "ns16550"; + reg = <10000600 8>; virtual-reg = ; - clock-frequency = <0>; - current-speed = <0>; - interrupt-parent = <&UIC1>; - interrupts = <5 4>; - }; + clock-frequency = <0>; + current-speed = <0>; + interrupt-parent = <&UIC1>; + interrupts = <5 4>; + }; IIC0: i2c@10000400 { compatible = "ibm,iic-440spe", "ibm,iic-440gp", "ibm,iic"; From c91f91e5fb04fc8fd8fa4b5e9d949031e631c107 Mon Sep 17 00:00:00 2001 From: Valentine Barshak Date: Wed, 27 Feb 2008 01:58:53 +1100 Subject: [PATCH 06/51] [POWERPC] 44x: add missing define TARGET_4xx and TARGET_440GX to cuboot-taishan In order to get the proper boad info (bd_info) structure defined in ppcboot.h both TARGET_4xx and TARGET_44x should be defined for all PowerPC 440 boards. The 440GX boards also need TARGET_440GX defined since they have 4 EMACs and there are 4 MAC addesses in bd_info passed by u-boot. Signed-off-by: Valentine Barshak Signed-off-by: Josh Boyer --- arch/powerpc/boot/cuboot-taishan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/boot/cuboot-taishan.c b/arch/powerpc/boot/cuboot-taishan.c index f66455a45ab1..b55b80467eed 100644 --- a/arch/powerpc/boot/cuboot-taishan.c +++ b/arch/powerpc/boot/cuboot-taishan.c @@ -21,7 +21,9 @@ #include "dcr.h" #include "4xx.h" +#define TARGET_4xx #define TARGET_44x +#define TARGET_440GX #include "ppcboot.h" static bd_t bd; From 0111a701867a796a7ca6ecbc385e4befc9f35066 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 27 Feb 2008 19:08:13 +1100 Subject: [PATCH 07/51] [POWERPC] spufs: fix invalid scheduling of forgotten contexts At present, we have a situation where a context with no owner is re-scheduled by spu_forget: Thread 1: reading regs file Thread 2: context owner spu_forget() - ctx->owner = NULL - set SPU_SCHED_WAS_ACTIVE spu_acquire_saved() - context is in saved state spu_release_saved() - SPU_SCHED_WAS_ACTIVE is set, so spu_activate() the context, which now has no owner In spu_forget(), we shouldn't be requesting a re-schedule by setting SPU_SCHED_WAS_ACTIVE. This change removes the set_bit in spu_forget(), so that spu_release_saved() doesn't reinsert this destroyed context on to the run queue. Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/context.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 133995ed5cc7..cf6c2c89211d 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -109,13 +109,12 @@ void spu_forget(struct spu_context *ctx) /* * This is basically an open-coded spu_acquire_saved, except that - * we don't acquire the state mutex interruptible. + * we don't acquire the state mutex interruptible, and we don't + * want this context to be rescheduled on release. */ mutex_lock(&ctx->state_mutex); - if (ctx->state != SPU_STATE_SAVED) { - set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags); + if (ctx->state != SPU_STATE_SAVED) spu_deactivate(ctx); - } mm = ctx->owner; ctx->owner = NULL; From fe57e8be9e858b6d7af4e088cbbe718f51241eee Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Thu, 28 Feb 2008 08:16:27 -0600 Subject: [PATCH 08/51] [POWERPC] 4xx: Use correct board info structure in cuboot wrappers Correct the remaining 44x cuboot wrappers to define TARGET_4xx as well. This creates the correct structure to use, including things like the second MAC address. Signed-off-by: Josh Boyer --- arch/powerpc/boot/cuboot-bamboo.c | 1 + arch/powerpc/boot/cuboot-ebony.c | 1 + arch/powerpc/boot/cuboot-katmai.c | 1 + arch/powerpc/boot/cuboot-warp.c | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/powerpc/boot/cuboot-bamboo.c b/arch/powerpc/boot/cuboot-bamboo.c index 900c7ff2b7e9..b5c30f766c40 100644 --- a/arch/powerpc/boot/cuboot-bamboo.c +++ b/arch/powerpc/boot/cuboot-bamboo.c @@ -17,6 +17,7 @@ #include "44x.h" #include "cuboot.h" +#define TARGET_4xx #define TARGET_44x #include "ppcboot.h" diff --git a/arch/powerpc/boot/cuboot-ebony.c b/arch/powerpc/boot/cuboot-ebony.c index c5f37ce172ea..56564ba37f62 100644 --- a/arch/powerpc/boot/cuboot-ebony.c +++ b/arch/powerpc/boot/cuboot-ebony.c @@ -17,6 +17,7 @@ #include "44x.h" #include "cuboot.h" +#define TARGET_4xx #define TARGET_44x #include "ppcboot.h" diff --git a/arch/powerpc/boot/cuboot-katmai.c b/arch/powerpc/boot/cuboot-katmai.c index c021167f9381..5434d70b5660 100644 --- a/arch/powerpc/boot/cuboot-katmai.c +++ b/arch/powerpc/boot/cuboot-katmai.c @@ -22,6 +22,7 @@ #include "44x.h" #include "cuboot.h" +#define TARGET_4xx #define TARGET_44x #include "ppcboot.h" diff --git a/arch/powerpc/boot/cuboot-warp.c b/arch/powerpc/boot/cuboot-warp.c index bdedebe1bc14..3db93e85e9ea 100644 --- a/arch/powerpc/boot/cuboot-warp.c +++ b/arch/powerpc/boot/cuboot-warp.c @@ -11,6 +11,7 @@ #include "4xx.h" #include "cuboot.h" +#define TARGET_4xx #define TARGET_44x #include "ppcboot.h" From f62f2fdd9c33160584b800da8c4a25ff1679225a Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Mon, 25 Feb 2008 10:34:47 +1100 Subject: [PATCH 09/51] [POWERPC] Xilinx: hwicap cleanup This fixes various items pointed out during a review of the hwicap driver. Primarily, reversed memcpy calls, re-entrancy issues, and mutex conversion have been addressed. There are also fixes to comments to use the kerneldoc format, as well as some sparse annotations. Signed-off-by: Stephen Neuendorffer Acked-by: Grant Likely Signed-off-by: Josh Boyer --- drivers/char/xilinx_hwicap/buffer_icap.c | 80 ++++++------ drivers/char/xilinx_hwicap/fifo_icap.c | 60 ++++----- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 138 +++++++++------------ drivers/char/xilinx_hwicap/xilinx_hwicap.h | 24 ++-- 4 files changed, 144 insertions(+), 158 deletions(-) diff --git a/drivers/char/xilinx_hwicap/buffer_icap.c b/drivers/char/xilinx_hwicap/buffer_icap.c index dfea2bde162b..f577daedb630 100644 --- a/drivers/char/xilinx_hwicap/buffer_icap.c +++ b/drivers/char/xilinx_hwicap/buffer_icap.c @@ -73,8 +73,8 @@ #define XHI_BUFFER_START 0 /** - * buffer_icap_get_status: Get the contents of the status register. - * @parameter base_address: is the base address of the device + * buffer_icap_get_status - Get the contents of the status register. + * @base_address: is the base address of the device * * The status register contains the ICAP status and the done bit. * @@ -94,9 +94,9 @@ static inline u32 buffer_icap_get_status(void __iomem *base_address) } /** - * buffer_icap_get_bram: Reads data from the storage buffer bram. - * @parameter base_address: contains the base address of the component. - * @parameter offset: The word offset from which the data should be read. + * buffer_icap_get_bram - Reads data from the storage buffer bram. + * @base_address: contains the base address of the component. + * @offset: The word offset from which the data should be read. * * A bram is used as a configuration memory cache. One frame of data can * be stored in this "storage buffer". @@ -108,8 +108,8 @@ static inline u32 buffer_icap_get_bram(void __iomem *base_address, } /** - * buffer_icap_busy: Return true if the icap device is busy - * @parameter base_address: is the base address of the device + * buffer_icap_busy - Return true if the icap device is busy + * @base_address: is the base address of the device * * The queries the low order bit of the status register, which * indicates whether the current configuration or readback operation @@ -121,8 +121,8 @@ static inline bool buffer_icap_busy(void __iomem *base_address) } /** - * buffer_icap_busy: Return true if the icap device is not busy - * @parameter base_address: is the base address of the device + * buffer_icap_busy - Return true if the icap device is not busy + * @base_address: is the base address of the device * * The queries the low order bit of the status register, which * indicates whether the current configuration or readback operation @@ -134,9 +134,9 @@ static inline bool buffer_icap_done(void __iomem *base_address) } /** - * buffer_icap_set_size: Set the size register. - * @parameter base_address: is the base address of the device - * @parameter data: The size in bytes. + * buffer_icap_set_size - Set the size register. + * @base_address: is the base address of the device + * @data: The size in bytes. * * The size register holds the number of 8 bit bytes to transfer between * bram and the icap (or icap to bram). @@ -148,9 +148,9 @@ static inline void buffer_icap_set_size(void __iomem *base_address, } /** - * buffer_icap_mSetoffsetReg: Set the bram offset register. - * @parameter base_address: contains the base address of the device. - * @parameter data: is the value to be written to the data register. + * buffer_icap_set_offset - Set the bram offset register. + * @base_address: contains the base address of the device. + * @data: is the value to be written to the data register. * * The bram offset register holds the starting bram address to transfer * data from during configuration or write data to during readback. @@ -162,9 +162,9 @@ static inline void buffer_icap_set_offset(void __iomem *base_address, } /** - * buffer_icap_set_rnc: Set the RNC (Readback not Configure) register. - * @parameter base_address: contains the base address of the device. - * @parameter data: is the value to be written to the data register. + * buffer_icap_set_rnc - Set the RNC (Readback not Configure) register. + * @base_address: contains the base address of the device. + * @data: is the value to be written to the data register. * * The RNC register determines the direction of the data transfer. It * controls whether a configuration or readback take place. Writing to @@ -178,10 +178,10 @@ static inline void buffer_icap_set_rnc(void __iomem *base_address, } /** - * buffer_icap_set_bram: Write data to the storage buffer bram. - * @parameter base_address: contains the base address of the component. - * @parameter offset: The word offset at which the data should be written. - * @parameter data: The value to be written to the bram offset. + * buffer_icap_set_bram - Write data to the storage buffer bram. + * @base_address: contains the base address of the component. + * @offset: The word offset at which the data should be written. + * @data: The value to be written to the bram offset. * * A bram is used as a configuration memory cache. One frame of data can * be stored in this "storage buffer". @@ -193,10 +193,10 @@ static inline void buffer_icap_set_bram(void __iomem *base_address, } /** - * buffer_icap_device_read: Transfer bytes from ICAP to the storage buffer. - * @parameter drvdata: a pointer to the drvdata. - * @parameter offset: The storage buffer start address. - * @parameter count: The number of words (32 bit) to read from the + * buffer_icap_device_read - Transfer bytes from ICAP to the storage buffer. + * @drvdata: a pointer to the drvdata. + * @offset: The storage buffer start address. + * @count: The number of words (32 bit) to read from the * device (ICAP). **/ static int buffer_icap_device_read(struct hwicap_drvdata *drvdata, @@ -227,10 +227,10 @@ static int buffer_icap_device_read(struct hwicap_drvdata *drvdata, }; /** - * buffer_icap_device_write: Transfer bytes from ICAP to the storage buffer. - * @parameter drvdata: a pointer to the drvdata. - * @parameter offset: The storage buffer start address. - * @parameter count: The number of words (32 bit) to read from the + * buffer_icap_device_write - Transfer bytes from ICAP to the storage buffer. + * @drvdata: a pointer to the drvdata. + * @offset: The storage buffer start address. + * @count: The number of words (32 bit) to read from the * device (ICAP). **/ static int buffer_icap_device_write(struct hwicap_drvdata *drvdata, @@ -261,8 +261,8 @@ static int buffer_icap_device_write(struct hwicap_drvdata *drvdata, }; /** - * buffer_icap_reset: Reset the logic of the icap device. - * @parameter drvdata: a pointer to the drvdata. + * buffer_icap_reset - Reset the logic of the icap device. + * @drvdata: a pointer to the drvdata. * * Writing to the status register resets the ICAP logic in an internal * version of the core. For the version of the core published in EDK, @@ -274,10 +274,10 @@ void buffer_icap_reset(struct hwicap_drvdata *drvdata) } /** - * buffer_icap_set_configuration: Load a partial bitstream from system memory. - * @parameter drvdata: a pointer to the drvdata. - * @parameter data: Kernel address of the partial bitstream. - * @parameter size: the size of the partial bitstream in 32 bit words. + * buffer_icap_set_configuration - Load a partial bitstream from system memory. + * @drvdata: a pointer to the drvdata. + * @data: Kernel address of the partial bitstream. + * @size: the size of the partial bitstream in 32 bit words. **/ int buffer_icap_set_configuration(struct hwicap_drvdata *drvdata, u32 *data, u32 size) @@ -333,10 +333,10 @@ int buffer_icap_set_configuration(struct hwicap_drvdata *drvdata, u32 *data, }; /** - * buffer_icap_get_configuration: Read configuration data from the device. - * @parameter drvdata: a pointer to the drvdata. - * @parameter data: Address of the data representing the partial bitstream - * @parameter size: the size of the partial bitstream in 32 bit words. + * buffer_icap_get_configuration - Read configuration data from the device. + * @drvdata: a pointer to the drvdata. + * @data: Address of the data representing the partial bitstream + * @size: the size of the partial bitstream in 32 bit words. **/ int buffer_icap_get_configuration(struct hwicap_drvdata *drvdata, u32 *data, u32 size) diff --git a/drivers/char/xilinx_hwicap/fifo_icap.c b/drivers/char/xilinx_hwicap/fifo_icap.c index 0988314694a6..6f45dbd47125 100644 --- a/drivers/char/xilinx_hwicap/fifo_icap.c +++ b/drivers/char/xilinx_hwicap/fifo_icap.c @@ -94,9 +94,9 @@ /** - * fifo_icap_fifo_write: Write data to the write FIFO. - * @parameter drvdata: a pointer to the drvdata. - * @parameter data: the 32-bit value to be written to the FIFO. + * fifo_icap_fifo_write - Write data to the write FIFO. + * @drvdata: a pointer to the drvdata. + * @data: the 32-bit value to be written to the FIFO. * * This function will silently fail if the fifo is full. **/ @@ -108,8 +108,8 @@ static inline void fifo_icap_fifo_write(struct hwicap_drvdata *drvdata, } /** - * fifo_icap_fifo_read: Read data from the Read FIFO. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_fifo_read - Read data from the Read FIFO. + * @drvdata: a pointer to the drvdata. * * This function will silently fail if the fifo is empty. **/ @@ -121,9 +121,9 @@ static inline u32 fifo_icap_fifo_read(struct hwicap_drvdata *drvdata) } /** - * fifo_icap_set_read_size: Set the the size register. - * @parameter drvdata: a pointer to the drvdata. - * @parameter data: the size of the following read transaction, in words. + * fifo_icap_set_read_size - Set the the size register. + * @drvdata: a pointer to the drvdata. + * @data: the size of the following read transaction, in words. **/ static inline void fifo_icap_set_read_size(struct hwicap_drvdata *drvdata, u32 data) @@ -132,8 +132,8 @@ static inline void fifo_icap_set_read_size(struct hwicap_drvdata *drvdata, } /** - * fifo_icap_start_config: Initiate a configuration (write) to the device. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_start_config - Initiate a configuration (write) to the device. + * @drvdata: a pointer to the drvdata. **/ static inline void fifo_icap_start_config(struct hwicap_drvdata *drvdata) { @@ -142,8 +142,8 @@ static inline void fifo_icap_start_config(struct hwicap_drvdata *drvdata) } /** - * fifo_icap_start_readback: Initiate a readback from the device. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_start_readback - Initiate a readback from the device. + * @drvdata: a pointer to the drvdata. **/ static inline void fifo_icap_start_readback(struct hwicap_drvdata *drvdata) { @@ -152,8 +152,8 @@ static inline void fifo_icap_start_readback(struct hwicap_drvdata *drvdata) } /** - * fifo_icap_busy: Return true if the ICAP is still processing a transaction. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_busy - Return true if the ICAP is still processing a transaction. + * @drvdata: a pointer to the drvdata. **/ static inline u32 fifo_icap_busy(struct hwicap_drvdata *drvdata) { @@ -163,8 +163,8 @@ static inline u32 fifo_icap_busy(struct hwicap_drvdata *drvdata) } /** - * fifo_icap_write_fifo_vacancy: Query the write fifo available space. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_write_fifo_vacancy - Query the write fifo available space. + * @drvdata: a pointer to the drvdata. * * Return the number of words that can be safely pushed into the write fifo. **/ @@ -175,8 +175,8 @@ static inline u32 fifo_icap_write_fifo_vacancy( } /** - * fifo_icap_read_fifo_occupancy: Query the read fifo available data. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_read_fifo_occupancy - Query the read fifo available data. + * @drvdata: a pointer to the drvdata. * * Return the number of words that can be safely read from the read fifo. **/ @@ -187,11 +187,11 @@ static inline u32 fifo_icap_read_fifo_occupancy( } /** - * fifo_icap_set_configuration: Send configuration data to the ICAP. - * @parameter drvdata: a pointer to the drvdata. - * @parameter frame_buffer: a pointer to the data to be written to the + * fifo_icap_set_configuration - Send configuration data to the ICAP. + * @drvdata: a pointer to the drvdata. + * @frame_buffer: a pointer to the data to be written to the * ICAP device. - * @parameter num_words: the number of words (32 bit) to write to the ICAP + * @num_words: the number of words (32 bit) to write to the ICAP * device. * This function writes the given user data to the Write FIFO in @@ -266,10 +266,10 @@ int fifo_icap_set_configuration(struct hwicap_drvdata *drvdata, } /** - * fifo_icap_get_configuration: Read configuration data from the device. - * @parameter drvdata: a pointer to the drvdata. - * @parameter data: Address of the data representing the partial bitstream - * @parameter size: the size of the partial bitstream in 32 bit words. + * fifo_icap_get_configuration - Read configuration data from the device. + * @drvdata: a pointer to the drvdata. + * @data: Address of the data representing the partial bitstream + * @size: the size of the partial bitstream in 32 bit words. * * This function reads the specified number of words from the ICAP device in * the polled mode. @@ -335,8 +335,8 @@ int fifo_icap_get_configuration(struct hwicap_drvdata *drvdata, } /** - * buffer_icap_reset: Reset the logic of the icap device. - * @parameter drvdata: a pointer to the drvdata. + * buffer_icap_reset - Reset the logic of the icap device. + * @drvdata: a pointer to the drvdata. * * This function forces the software reset of the complete HWICAP device. * All the registers will return to the default value and the FIFO is also @@ -360,8 +360,8 @@ void fifo_icap_reset(struct hwicap_drvdata *drvdata) } /** - * fifo_icap_flush_fifo: This function flushes the FIFOs in the device. - * @parameter drvdata: a pointer to the drvdata. + * fifo_icap_flush_fifo - This function flushes the FIFOs in the device. + * @drvdata: a pointer to the drvdata. */ void fifo_icap_flush_fifo(struct hwicap_drvdata *drvdata) { diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index 24f6aef0fd3c..2284fa2a5a57 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -84,7 +84,7 @@ #include #include #include -#include +#include #include #include #include @@ -119,6 +119,7 @@ module_param(xhwicap_minor, int, S_IRUGO); /* An array, which is set to true when the device is registered. */ static bool probed_devices[HWICAP_DEVICES]; +static struct mutex icap_sem; static struct class *icap_class; @@ -199,14 +200,14 @@ static const struct config_registers v5_config_registers = { }; /** - * hwicap_command_desync: Send a DESYNC command to the ICAP port. - * @parameter drvdata: a pointer to the drvdata. + * hwicap_command_desync - Send a DESYNC command to the ICAP port. + * @drvdata: a pointer to the drvdata. * * This command desynchronizes the ICAP After this command, a * bitstream containing a NULL packet, followed by a SYNCH packet is * required before the ICAP will recognize commands. */ -int hwicap_command_desync(struct hwicap_drvdata *drvdata) +static int hwicap_command_desync(struct hwicap_drvdata *drvdata) { u32 buffer[4]; u32 index = 0; @@ -228,51 +229,18 @@ int hwicap_command_desync(struct hwicap_drvdata *drvdata) } /** - * hwicap_command_capture: Send a CAPTURE command to the ICAP port. - * @parameter drvdata: a pointer to the drvdata. - * - * This command captures all of the flip flop states so they will be - * available during readback. One can use this command instead of - * enabling the CAPTURE block in the design. - */ -int hwicap_command_capture(struct hwicap_drvdata *drvdata) -{ - u32 buffer[7]; - u32 index = 0; - - /* - * Create the data to be written to the ICAP. - */ - buffer[index++] = XHI_DUMMY_PACKET; - buffer[index++] = XHI_SYNC_PACKET; - buffer[index++] = XHI_NOOP_PACKET; - buffer[index++] = hwicap_type_1_write(drvdata->config_regs->CMD) | 1; - buffer[index++] = XHI_CMD_GCAPTURE; - buffer[index++] = XHI_DUMMY_PACKET; - buffer[index++] = XHI_DUMMY_PACKET; - - /* - * Write the data to the FIFO and intiate the transfer of data - * present in the FIFO to the ICAP device. - */ - return drvdata->config->set_configuration(drvdata, - &buffer[0], index); - -} - -/** - * hwicap_get_configuration_register: Query a configuration register. - * @parameter drvdata: a pointer to the drvdata. - * @parameter reg: a constant which represents the configuration + * hwicap_get_configuration_register - Query a configuration register. + * @drvdata: a pointer to the drvdata. + * @reg: a constant which represents the configuration * register value to be returned. * Examples: XHI_IDCODE, XHI_FLR. - * @parameter RegData: returns the value of the register. + * @reg_data: returns the value of the register. * * Sends a query packet to the ICAP and then receives the response. * The icap is left in Synched state. */ -int hwicap_get_configuration_register(struct hwicap_drvdata *drvdata, - u32 reg, u32 *RegData) +static int hwicap_get_configuration_register(struct hwicap_drvdata *drvdata, + u32 reg, u32 *reg_data) { int status; u32 buffer[6]; @@ -300,14 +268,14 @@ int hwicap_get_configuration_register(struct hwicap_drvdata *drvdata, /* * Read the configuration register */ - status = drvdata->config->get_configuration(drvdata, RegData, 1); + status = drvdata->config->get_configuration(drvdata, reg_data, 1); if (status) return status; return 0; } -int hwicap_initialize_hwicap(struct hwicap_drvdata *drvdata) +static int hwicap_initialize_hwicap(struct hwicap_drvdata *drvdata) { int status; u32 idcode; @@ -344,7 +312,7 @@ int hwicap_initialize_hwicap(struct hwicap_drvdata *drvdata) } static ssize_t -hwicap_read(struct file *file, char *buf, size_t count, loff_t *ppos) +hwicap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct hwicap_drvdata *drvdata = file->private_data; ssize_t bytes_to_read = 0; @@ -353,8 +321,9 @@ hwicap_read(struct file *file, char *buf, size_t count, loff_t *ppos) u32 bytes_remaining; int status; - if (down_interruptible(&drvdata->sem)) - return -ERESTARTSYS; + status = mutex_lock_interruptible(&drvdata->sem); + if (status) + return status; if (drvdata->read_buffer_in_use) { /* If there are leftover bytes in the buffer, just */ @@ -370,8 +339,9 @@ hwicap_read(struct file *file, char *buf, size_t count, loff_t *ppos) goto error; } drvdata->read_buffer_in_use -= bytes_to_read; - memcpy(drvdata->read_buffer + bytes_to_read, - drvdata->read_buffer, 4 - bytes_to_read); + memmove(drvdata->read_buffer, + drvdata->read_buffer + bytes_to_read, + 4 - bytes_to_read); } else { /* Get new data from the ICAP, and return was was requested. */ kbuf = (u32 *) get_zeroed_page(GFP_KERNEL); @@ -414,18 +384,20 @@ hwicap_read(struct file *file, char *buf, size_t count, loff_t *ppos) status = -EFAULT; goto error; } - memcpy(kbuf, drvdata->read_buffer, bytes_remaining); + memcpy(drvdata->read_buffer, + kbuf, + bytes_remaining); drvdata->read_buffer_in_use = bytes_remaining; free_page((unsigned long)kbuf); } status = bytes_to_read; error: - up(&drvdata->sem); + mutex_unlock(&drvdata->sem); return status; } static ssize_t -hwicap_write(struct file *file, const char *buf, +hwicap_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct hwicap_drvdata *drvdata = file->private_data; @@ -435,8 +407,9 @@ hwicap_write(struct file *file, const char *buf, ssize_t len; ssize_t status; - if (down_interruptible(&drvdata->sem)) - return -ERESTARTSYS; + status = mutex_lock_interruptible(&drvdata->sem); + if (status) + return status; left += drvdata->write_buffer_in_use; @@ -465,7 +438,7 @@ hwicap_write(struct file *file, const char *buf, memcpy(kbuf, drvdata->write_buffer, drvdata->write_buffer_in_use); if (copy_from_user( - (((char *)kbuf) + (drvdata->write_buffer_in_use)), + (((char *)kbuf) + drvdata->write_buffer_in_use), buf + written, len - (drvdata->write_buffer_in_use))) { free_page((unsigned long)kbuf); @@ -508,7 +481,7 @@ hwicap_write(struct file *file, const char *buf, free_page((unsigned long)kbuf); status = written; error: - up(&drvdata->sem); + mutex_unlock(&drvdata->sem); return status; } @@ -519,8 +492,9 @@ static int hwicap_open(struct inode *inode, struct file *file) drvdata = container_of(inode->i_cdev, struct hwicap_drvdata, cdev); - if (down_interruptible(&drvdata->sem)) - return -ERESTARTSYS; + status = mutex_lock_interruptible(&drvdata->sem); + if (status) + return status; if (drvdata->is_open) { status = -EBUSY; @@ -539,7 +513,7 @@ static int hwicap_open(struct inode *inode, struct file *file) drvdata->is_open = 1; error: - up(&drvdata->sem); + mutex_unlock(&drvdata->sem); return status; } @@ -549,8 +523,7 @@ static int hwicap_release(struct inode *inode, struct file *file) int i; int status = 0; - if (down_interruptible(&drvdata->sem)) - return -ERESTARTSYS; + mutex_lock(&drvdata->sem); if (drvdata->write_buffer_in_use) { /* Flush write buffer. */ @@ -569,7 +542,7 @@ static int hwicap_release(struct inode *inode, struct file *file) error: drvdata->is_open = 0; - up(&drvdata->sem); + mutex_unlock(&drvdata->sem); return status; } @@ -592,31 +565,36 @@ static int __devinit hwicap_setup(struct device *dev, int id, dev_info(dev, "Xilinx icap port driver\n"); + mutex_lock(&icap_sem); + if (id < 0) { for (id = 0; id < HWICAP_DEVICES; id++) if (!probed_devices[id]) break; } if (id < 0 || id >= HWICAP_DEVICES) { + mutex_unlock(&icap_sem); dev_err(dev, "%s%i too large\n", DRIVER_NAME, id); return -EINVAL; } if (probed_devices[id]) { + mutex_unlock(&icap_sem); dev_err(dev, "cannot assign to %s%i; it is already in use\n", DRIVER_NAME, id); return -EBUSY; } probed_devices[id] = 1; + mutex_unlock(&icap_sem); devt = MKDEV(xhwicap_major, xhwicap_minor + id); - drvdata = kmalloc(sizeof(struct hwicap_drvdata), GFP_KERNEL); + drvdata = kzalloc(sizeof(struct hwicap_drvdata), GFP_KERNEL); if (!drvdata) { dev_err(dev, "Couldn't allocate device private record\n"); - return -ENOMEM; + retval = -ENOMEM; + goto failed0; } - memset((void *)drvdata, 0, sizeof(struct hwicap_drvdata)); dev_set_drvdata(dev, (void *)drvdata); if (!regs_res) { @@ -648,7 +626,7 @@ static int __devinit hwicap_setup(struct device *dev, int id, drvdata->config = config; drvdata->config_regs = config_regs; - init_MUTEX(&drvdata->sem); + mutex_init(&drvdata->sem); drvdata->is_open = 0; dev_info(dev, "ioremap %lx to %p with size %x\n", @@ -663,7 +641,7 @@ static int __devinit hwicap_setup(struct device *dev, int id, goto failed3; } /* devfs_mk_cdev(devt, S_IFCHR|S_IRUGO|S_IWUGO, DRIVER_NAME); */ - class_device_create(icap_class, NULL, devt, NULL, DRIVER_NAME); + device_create(icap_class, dev, devt, "%s%d", DRIVER_NAME, id); return 0; /* success */ failed3: @@ -675,6 +653,11 @@ static int __devinit hwicap_setup(struct device *dev, int id, failed1: kfree(drvdata); + failed0: + mutex_lock(&icap_sem); + probed_devices[id] = 0; + mutex_unlock(&icap_sem); + return retval; } @@ -699,14 +682,16 @@ static int __devexit hwicap_remove(struct device *dev) if (!drvdata) return 0; - class_device_destroy(icap_class, drvdata->devt); + device_destroy(icap_class, drvdata->devt); cdev_del(&drvdata->cdev); iounmap(drvdata->base_address); release_mem_region(drvdata->mem_start, drvdata->mem_size); kfree(drvdata); dev_set_drvdata(dev, NULL); - probed_devices[MINOR(dev->devt)-xhwicap_minor] = 0; + mutex_lock(&icap_sem); + probed_devices[MINOR(dev->devt)-xhwicap_minor] = 0; + mutex_unlock(&icap_sem); return 0; /* success */ } @@ -821,28 +806,29 @@ static struct of_platform_driver hwicap_of_driver = { }; /* Registration helpers to keep the number of #ifdefs to a minimum */ -static inline int __devinit hwicap_of_register(void) +static inline int __init hwicap_of_register(void) { pr_debug("hwicap: calling of_register_platform_driver()\n"); return of_register_platform_driver(&hwicap_of_driver); } -static inline void __devexit hwicap_of_unregister(void) +static inline void __exit hwicap_of_unregister(void) { of_unregister_platform_driver(&hwicap_of_driver); } #else /* CONFIG_OF */ /* CONFIG_OF not enabled; do nothing helpers */ -static inline int __devinit hwicap_of_register(void) { return 0; } -static inline void __devexit hwicap_of_unregister(void) { } +static inline int __init hwicap_of_register(void) { return 0; } +static inline void __exit hwicap_of_unregister(void) { } #endif /* CONFIG_OF */ -static int __devinit hwicap_module_init(void) +static int __init hwicap_module_init(void) { dev_t devt; int retval; icap_class = class_create(THIS_MODULE, "xilinx_config"); + mutex_init(&icap_sem); if (xhwicap_major) { devt = MKDEV(xhwicap_major, xhwicap_minor); @@ -883,7 +869,7 @@ static int __devinit hwicap_module_init(void) return retval; } -static void __devexit hwicap_module_cleanup(void) +static void __exit hwicap_module_cleanup(void) { dev_t devt = MKDEV(xhwicap_major, xhwicap_minor); diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.h b/drivers/char/xilinx_hwicap/xilinx_hwicap.h index ae771cac1629..405fee7e189b 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.h +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.h @@ -48,9 +48,9 @@ struct hwicap_drvdata { u8 write_buffer[4]; u32 read_buffer_in_use; /* Always in [0,3] */ u8 read_buffer[4]; - u32 mem_start; /* phys. address of the control registers */ - u32 mem_end; /* phys. address of the control registers */ - u32 mem_size; + resource_size_t mem_start;/* phys. address of the control registers */ + resource_size_t mem_end; /* phys. address of the control registers */ + resource_size_t mem_size; void __iomem *base_address;/* virt. address of the control registers */ struct device *dev; @@ -61,7 +61,7 @@ struct hwicap_drvdata { const struct config_registers *config_regs; void *private_data; bool is_open; - struct semaphore sem; + struct mutex sem; }; struct hwicap_driver_config { @@ -164,29 +164,29 @@ struct config_registers { #define XHI_DISABLED_AUTO_CRC 0x0000DEFCUL /** - * hwicap_type_1_read: Generates a Type 1 read packet header. - * @parameter: Register is the address of the register to be read back. + * hwicap_type_1_read - Generates a Type 1 read packet header. + * @reg: is the address of the register to be read back. * * Generates a Type 1 read packet header, which is used to indirectly * read registers in the configuration logic. This packet must then * be sent through the icap device, and a return packet received with * the information. **/ -static inline u32 hwicap_type_1_read(u32 Register) +static inline u32 hwicap_type_1_read(u32 reg) { return (XHI_TYPE_1 << XHI_TYPE_SHIFT) | - (Register << XHI_REGISTER_SHIFT) | + (reg << XHI_REGISTER_SHIFT) | (XHI_OP_READ << XHI_OP_SHIFT); } /** - * hwicap_type_1_write: Generates a Type 1 write packet header - * @parameter: Register is the address of the register to be read back. + * hwicap_type_1_write - Generates a Type 1 write packet header + * @reg: is the address of the register to be read back. **/ -static inline u32 hwicap_type_1_write(u32 Register) +static inline u32 hwicap_type_1_write(u32 reg) { return (XHI_TYPE_1 << XHI_TYPE_SHIFT) | - (Register << XHI_REGISTER_SHIFT) | + (reg << XHI_REGISTER_SHIFT) | (XHI_OP_WRITE << XHI_OP_SHIFT); } From 71791bee90dd29b292c7e55c1c00857578c912bd Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 25 Feb 2008 14:58:37 +1100 Subject: [PATCH 10/51] [POWERPC] spufs: fix order of sputrace thread IDs Currently, we get the following output from sputrace: [5.097935954] 1606: spufs_ps_nopfn__enter (thread = 1605, spu = -1) [5.097958164] 1606: spufs_ps_nopfn__insert (thread = 1605, spu = 15) [5.097973529] 1607: spufs_ps_nopfn__enter (thread = 1605, spu = -1) [5.097989174] 1607: spufs_ps_nopfn__insert (thread = 1605, spu = 14) Which leads me to believe that 160[67] is the current thread ID, and 1605 is the context backing the psmap. However, the 'current' and 'owner' tids are reversed - the 'current' tid is on the right. This change puts the current thread ID in the left-hand column instead, and renames the right to 'ctxthread'. Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/sputrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c index 01974f7776e1..79aa773f3c99 100644 --- a/arch/powerpc/platforms/cell/spufs/sputrace.c +++ b/arch/powerpc/platforms/cell/spufs/sputrace.c @@ -58,12 +58,12 @@ static int sputrace_sprint(char *tbuf, int n) ktime_to_timespec(ktime_sub(t->tstamp, sputrace_start)); return snprintf(tbuf, n, - "[%lu.%09lu] %d: %s (thread = %d, spu = %d)\n", + "[%lu.%09lu] %d: %s (ctxthread = %d, spu = %d)\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, - t->owner_tid, - t->name, t->curr_tid, + t->name, + t->owner_tid, t->number); } @@ -188,6 +188,7 @@ struct spu_probe spu_probes[] = { { "spufs_ps_nopfn__insert", "%p %p", spu_context_event }, { "spu_acquire_saved__enter", "%p", spu_context_nospu_event }, { "destroy_spu_context__enter", "%p", spu_context_nospu_event }, + { "spufs_stop_callback__enter", "%p %p", spu_context_event }, }; static int __init sputrace_init(void) From fae9ca791507876c3ccaa8ab686b2ce42dc7a560 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Feb 2008 15:16:48 +1100 Subject: [PATCH 11/51] [POWERPC] spufs: synchronize IRQ when disabling There is a small race between the context save procedure and the SPU interrupt handling, where we expect all interrupt processing to have finished after disabling them, while an interrupt is still being processed on another CPU. The obvious fix is to call synchronize_irq() after disabling the interrupts at the start of the context save procedure to make sure we never access the SPU any more during an ongoing save or even after that. Thanks to Benjamin Herrenschmidt for pointing this out. Acked-by: Benjamin Herrenschmidt Signed-off-by: Arnd Bergmann Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/switch.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c index 6f5886c7b1f9..e9dc7a55d1b9 100644 --- a/arch/powerpc/platforms/cell/spufs/switch.c +++ b/arch/powerpc/platforms/cell/spufs/switch.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -117,6 +118,8 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu) * Write INT_MASK_class1 with value of 0. * Save INT_Mask_class2 in CSA. * Write INT_MASK_class2 with value of 0. + * Synchronize all three interrupts to be sure + * we no longer execute a handler on another CPU. */ spin_lock_irq(&spu->register_lock); if (csa) { @@ -129,6 +132,9 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu) spu_int_mask_set(spu, 2, 0ul); eieio(); spin_unlock_irq(&spu->register_lock); + synchronize_irq(spu->irqs[0]); + synchronize_irq(spu->irqs[1]); + synchronize_irq(spu->irqs[2]); } static inline void set_watchdog_timer(struct spu_state *csa, struct spu *spu) From cc4b7c1814c9ad375e8167ea4a9ec4a0ec1ada04 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Feb 2008 07:01:56 +0100 Subject: [PATCH 12/51] [POWERPC] spufs: invalidate SLB translation before adding a new entry When we replace an SLB entry in the MFC after using up all the available entries, there is a short window in which an incorrect entry is marked as valid. The problem is that the 'valid' bit is stored in the ESID, which is always written after the VSID. Overwriting the VSID first will make the original ESID entry point to the new VSID, which means that any concurrent DMA accessing the old ESID ends up being redirected to the new virtual address. A few cycles later, we write the new ESID and everything is fine again. That race can be closed by writing a zero entry to the ESID first, which makes sure that the VSID is not accessed until we write the new ESID. Note that we don't actually need to invalidate the SLB entry using the invalidation register, which would also flush any ERAT entries for that segment, because the segment translation does not become invalid but is only removed from the SLB cache. Signed-off-by: Arnd Bergmann Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spu_base.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 87eb07f94c5f..cfc28e93c825 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -148,7 +148,11 @@ static inline void spu_load_slb(struct spu *spu, int slbe, struct spu_slb *slb) __func__, slbe, slb->vsid, slb->esid); out_be64(&priv2->slb_index_W, slbe); + /* set invalid before writing vsid */ + out_be64(&priv2->slb_esid_RW, 0); + /* now it's safe to write the vsid */ out_be64(&priv2->slb_vsid_RW, slb->vsid); + /* setting the new esid makes the entry valid again */ out_be64(&priv2->slb_esid_RW, slb->esid); } From c92a1acb675058375cc508ad024c33358b42d766 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Feb 2008 06:06:30 +0100 Subject: [PATCH 13/51] [POWERPC] spufs: serialize SLB invalidation against SLB loading There is a potential race between flushes of the entire SLB in the MFC and the point where new entries are being established. The problem is that we might put a ESID entry into the MFC SLB when the VSID entry has just been cleared by the global flush. This can be circumvented by holding the register_lock throughout both the flushing and the creation of SLB entries. Signed-off-by: Arnd Bergmann Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spu_base.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index cfc28e93c825..712001f6b7da 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -81,9 +81,12 @@ struct spu_slb { void spu_invalidate_slbs(struct spu *spu) { struct spu_priv2 __iomem *priv2 = spu->priv2; + unsigned long flags; + spin_lock_irqsave(&spu->register_lock, flags); if (spu_mfc_sr1_get(spu) & MFC_STATE1_RELOCATE_MASK) out_be64(&priv2->slb_invalidate_all_W, 0UL); + spin_unlock_irqrestore(&spu->register_lock, flags); } EXPORT_SYMBOL_GPL(spu_invalidate_slbs); @@ -294,9 +297,11 @@ void spu_setup_kernel_slbs(struct spu *spu, struct spu_lscsa *lscsa, nr_slbs++; } + spin_lock_irq(&spu->register_lock); /* Add the set of SLBs */ for (i = 0; i < nr_slbs; i++) spu_load_slb(spu, i, &slbs[i]); + spin_unlock_irq(&spu->register_lock); } EXPORT_SYMBOL_GPL(spu_setup_kernel_slbs); @@ -341,13 +346,14 @@ spu_irq_class_1(int irq, void *data) if (stat & CLASS1_STORAGE_FAULT_INTR) spu_mfc_dsisr_set(spu, 0ul); spu_int_stat_clear(spu, 1, stat); - spin_unlock(&spu->register_lock); - pr_debug("%s: %lx %lx %lx %lx\n", __FUNCTION__, mask, stat, - dar, dsisr); if (stat & CLASS1_SEGMENT_FAULT_INTR) __spu_trap_data_seg(spu, dar); + spin_unlock(&spu->register_lock); + pr_debug("%s: %lx %lx %lx %lx\n", __FUNCTION__, mask, stat, + dar, dsisr); + if (stat & CLASS1_STORAGE_FAULT_INTR) __spu_trap_data_map(spu, dar, dsisr); From 2a58aa33daef37134c8a43dca0b7578c3fa7f993 Mon Sep 17 00:00:00 2001 From: Andre Detsch Date: Mon, 25 Feb 2008 15:07:42 -0300 Subject: [PATCH 14/51] [POWERPC] spufs: fix use time accounting on SPE-overcommit The spu_runcntl_RW register is restored within spu_restore function. So, at the end of spu_bind_context, the SPU context is not just loaded, but running. This change corrects the state switch to account the time as USER. Signed-off-by: Andre Detsch Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 3a5972117de7..5d5f680cd0b8 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -246,7 +246,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx) spu_switch_notify(spu, ctx); ctx->state = SPU_STATE_RUNNABLE; - spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED); + spuctx_switch_state(ctx, SPU_UTIL_USER); } /* From 0c82d83cb09a1c9fd4d24d32064ce827709c104b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 27 Feb 2008 13:44:59 +0100 Subject: [PATCH 15/51] [ARM] Fix freeing of page tables for ARM in free_pgd_slow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2f569af (CONFIG_HIGHPTE vs. sub-page page tables.) pte_free() calls pte_lock_deinit() and dec_zone_page_state(). So free_pgd_slow must not call the latter two when calling the first. Signed-off-by: Uwe Kleine-König Cc: Martin Schwidefsky Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Russell King --- arch/arm/mm/pgd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 500c9610ab30..e0f19ab91163 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -75,7 +75,7 @@ no_pgd: void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) { pmd_t *pmd; - struct page *pte; + pgtable_t pte; if (!pgd) return; @@ -90,10 +90,8 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) goto free; } - pte = pmd_page(*pmd); + pte = pmd_pgtable(*pmd); pmd_clear(pmd); - dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); - pte_lock_deinit(pte); pte_free(mm, pte); pmd_free(mm, pmd); free: From 101fd46a753f8931a05d252bf5564c9415a5f8d7 Mon Sep 17 00:00:00 2001 From: Bob Nelson Date: Wed, 20 Feb 2008 05:00:56 +0100 Subject: [PATCH 16/51] [POWERPC] OProfile: enable callgraph support for Cell This patch enables OProfile callgraph support for the Cell processor. The original code was just calling a function to add the PC value, now it will call a function that first checks the callgraph depth. Callgraph is already enabled on the other Power platforms. Signed-off-by: Bob Nelson Signed-off-by: Arnd Bergmann --- arch/powerpc/oprofile/op_model_cell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 13929771bee7..9eed1f68fcab 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -1151,7 +1151,7 @@ static void cell_handle_interrupt(struct pt_regs *regs, for (i = 0; i < num_counters; ++i) { if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(i)) && ctr[i].enabled) { - oprofile_add_pc(pc, is_kernel, i); + oprofile_add_ext_sample(pc, regs, i, is_kernel); cbe_write_ctr(cpu, i, reset_value[i]); } } From 9176c0b1f5a9099cebc07458042ae6a7c75af7b2 Mon Sep 17 00:00:00 2001 From: Jens Osterkamp Date: Thu, 28 Feb 2008 11:26:21 +0100 Subject: [PATCH 17/51] [POWERPC] move celleb DABRX definitions This moves the private DABRX definitions for celleb from beat.h to reg.h to make them usable for all. Signed-off-by: Jens Osterkamp Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/celleb/beat.h | 3 --- include/asm-powerpc/reg.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/celleb/beat.h b/arch/powerpc/platforms/celleb/beat.h index b2e292df13ca..ac82ac35b991 100644 --- a/arch/powerpc/platforms/celleb/beat.h +++ b/arch/powerpc/platforms/celleb/beat.h @@ -21,9 +21,6 @@ #ifndef _CELLEB_BEAT_H #define _CELLEB_BEAT_H -#define DABRX_KERNEL (1UL<<1) -#define DABRX_USER (1UL<<0) - int64_t beat_get_term_char(uint64_t,uint64_t*,uint64_t*,uint64_t*); int64_t beat_put_term_char(uint64_t,uint64_t,uint64_t,uint64_t); int64_t beat_repository_encode(int, const char *, uint64_t[4]); diff --git a/include/asm-powerpc/reg.h b/include/asm-powerpc/reg.h index 0d6238987df8..edc0cfd7f6e2 100644 --- a/include/asm-powerpc/reg.h +++ b/include/asm-powerpc/reg.h @@ -153,6 +153,9 @@ #define CTRL_RUNLATCH 0x1 #define SPRN_DABR 0x3F5 /* Data Address Breakpoint Register */ #define DABR_TRANSLATION (1UL << 2) +#define SPRN_DABRX 0x3F7 /* Data Address Breakpoint Register Extension */ +#define DABRX_USER (1UL << 0) +#define DABRX_KERNEL (1UL << 1) #define SPRN_DAR 0x013 /* Data Address Register */ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ #define DSISR_NOHPTE 0x40000000 /* no translation found */ From f3c1ed9720ec62626bbf3e0c3648568c131978e2 Mon Sep 17 00:00:00 2001 From: Jens Osterkamp Date: Thu, 28 Feb 2008 11:27:31 +0100 Subject: [PATCH 18/51] [POWERPC] enable hardware watchpoints on cell blades Ulrich Weigand has found that the hardware watchpoints on cell were not working back in November : http://ozlabs.org/pipermail/linuxppc-dev/2007-November/046135.html This patch sets them during initialization. Signed-off-by: Jens Osterkamp Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/setup.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index a7f609b3b876..dda34650cb07 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -149,6 +149,11 @@ static void __init cell_init_irq(void) mpic_init_IRQ(); } +static void __init cell_set_dabrx(void) +{ + mtspr(SPRN_DABRX, DABRX_KERNEL | DABRX_USER); +} + static void __init cell_setup_arch(void) { #ifdef CONFIG_SPU_BASE @@ -158,6 +163,8 @@ static void __init cell_setup_arch(void) cbe_regs_init(); + cell_set_dabrx(); + #ifdef CONFIG_CBE_RAS cbe_ras_init(); #endif From f9660e8a6c16e17935777cdee5194842904c2d72 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:22 +1100 Subject: [PATCH 19/51] [POWERPC] Clearup cell IOMMU fixed mapping terminology It's called the fixed mapping, not the static mapping. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index edab631a8dcb..bbe838996470 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -549,7 +549,7 @@ static void cell_dma_dev_setup_iommu(struct device *dev) archdata->dma_data = &window->table; } -static void cell_dma_dev_setup_static(struct device *dev); +static void cell_dma_dev_setup_fixed(struct device *dev); static void cell_dma_dev_setup(struct device *dev) { @@ -557,7 +557,7 @@ static void cell_dma_dev_setup(struct device *dev) /* Order is important here, these are not mutually exclusive */ if (get_dma_ops(dev) == &dma_iommu_fixed_ops) - cell_dma_dev_setup_static(dev); + cell_dma_dev_setup_fixed(dev); else if (get_pci_dma_ops() == &dma_iommu_ops) cell_dma_dev_setup_iommu(dev); else if (get_pci_dma_ops() == &dma_direct_ops) @@ -858,7 +858,7 @@ static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask) return 0; } -static void cell_dma_dev_setup_static(struct device *dev) +static void cell_dma_dev_setup_fixed(struct device *dev) { struct dev_archdata *archdata = &dev->archdata; u64 addr; @@ -894,7 +894,7 @@ static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, for (i = fbase; i < fbase + fsize; i++, uaddr += IOMMU_PAGE_SIZE) { /* Don't touch the dynamic region */ if (i >= dbase && i < (dbase + dsize)) { - pr_debug("iommu: static/dynamic overlap, skipping\n"); + pr_debug("iommu: fixed/dynamic overlap, skipping\n"); continue; } io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask); From 0d7386ebffd8506b28c37a7d5541132a576f64e2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:23 +1100 Subject: [PATCH 20/51] [POWERPC] Use it_offset not pte_offset in cell IOMMU code The cell IOMMU tce build and free routines use pte_offset to convert the index passed from the generic IOMMU code into a page table offset. This takes into account the SPIDER_DMA_OFFSET which sets the top bit of every DMA address. However it doesn't cater for the IOMMU window starting at a non-zero address, as the base of the window is not incorporated into pte_offset at all. As it turns out tbl->it_offset already contains the value we need, it takes into account the base of the window and also pte_offset. So use it instead! Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index bbe838996470..4e75919bf6b9 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -200,7 +200,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages, (window->ioid & IOPTE_IOID_Mask); #endif - io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset); + io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset); for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE) io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask); @@ -232,7 +232,7 @@ static void tce_free_cell(struct iommu_table *tbl, long index, long npages) | (window->ioid & IOPTE_IOID_Mask); #endif - io_pte = (unsigned long *)tbl->it_base + (index - window->pte_offset); + io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset); for (i = 0; i < npages; i++) io_pte[i] = pte; From 08e024272e529076663e5b4dc8eeecd4131f8a48 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:23 +1100 Subject: [PATCH 21/51] [POWERPC] Remove unused pte_offset variable The cell IOMMU code no longer needs to save the pte_offset variable separately, it is incorporated into tbl->it_offset. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 4e75919bf6b9..555d264ad568 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -123,7 +123,6 @@ struct iommu_window { struct cbe_iommu *iommu; unsigned long offset; unsigned long size; - unsigned long pte_offset; unsigned int ioid; struct iommu_table table; }; @@ -475,13 +474,11 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->size = size; window->ioid = ioid; window->iommu = iommu; - window->pte_offset = pte_offset; window->table.it_blocksize = 16; window->table.it_base = (unsigned long)iommu->ptab; window->table.it_index = iommu->nid; - window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) + - window->pte_offset; + window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) + pte_offset; window->table.it_size = size >> IOMMU_PAGE_SHIFT; iommu_init_table(&window->table, iommu->nid); From edf441fb80f9d7a962c298e8da94c8c64802fffa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:24 +1100 Subject: [PATCH 22/51] [POWERPC] Move allocation of cell IOMMU pad page There's no need to allocate the pad page unless we're going to actually use it - so move the allocation to where we know we're going to use it. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 555d264ad568..8e57e1af3785 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -344,12 +344,6 @@ static void cell_iommu_setup_page_tables(struct cbe_iommu *iommu, iommu->ptab = page_address(page); memset(iommu->ptab, 0, ptab_size); - /* allocate a bogus page for the end of each mapping */ - page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0); - BUG_ON(!page); - iommu->pad_page = page_address(page); - clear_page(iommu->pad_page); - /* number of pages needed for a page table */ n_pte_pages = (pages_per_segment * sizeof(unsigned long)) >> IOMMU_PAGE_SHIFT; @@ -463,6 +457,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, unsigned long pte_offset) { struct iommu_window *window; + struct page *page; u32 ioid; ioid = cell_iommu_get_ioid(np); @@ -501,6 +496,11 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, * This code also assumes that we have a window that starts at 0, * which is the case on all spider based blades. */ + page = alloc_pages_node(iommu->nid, GFP_KERNEL, 0); + BUG_ON(!page); + iommu->pad_page = page_address(page); + clear_page(iommu->pad_page); + __set_bit(0, window->table.it_map); tce_build_cell(&window->table, window->table.it_offset, 1, (unsigned long)iommu->pad_page, DMA_TO_DEVICE); From 7d432ff1b7db87e78eb74d42631d2a23ca6f26f2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:25 +1100 Subject: [PATCH 23/51] [POWERPC] Split setup of IOMMU stab and ptab, allocate dynamic/fixed ptabs separately Currently the cell IOMMU code allocates the entire IOMMU page table in a contiguous chunk. This is nice and tidy, but for machines with larger amounts of RAM the page table allocation can fail due to it simply being too large. So split the segment table and page table setup routine, and arrange to have the dynamic and fixed page tables allocated separately. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 69 ++++++++++++++++++----------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 8e57e1af3785..187a723eafcd 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -306,50 +306,54 @@ static int cell_iommu_find_ioc(int nid, unsigned long *base) return -ENODEV; } -static void cell_iommu_setup_page_tables(struct cbe_iommu *iommu, +static void cell_iommu_setup_stab(struct cbe_iommu *iommu, unsigned long dbase, unsigned long dsize, unsigned long fbase, unsigned long fsize) { struct page *page; - int i; - unsigned long reg, segments, pages_per_segment, ptab_size, stab_size, - n_pte_pages, base; - - base = dbase; - if (fsize != 0) - base = min(fbase, dbase); + unsigned long segments, stab_size; segments = max(dbase + dsize, fbase + fsize) >> IO_SEGMENT_SHIFT; - pages_per_segment = 1ull << IO_PAGENO_BITS; - pr_debug("%s: iommu[%d]: segments: %lu, pages per segment: %lu\n", - __FUNCTION__, iommu->nid, segments, pages_per_segment); + pr_debug("%s: iommu[%d]: segments: %lu\n", + __FUNCTION__, iommu->nid, segments); /* set up the segment table */ stab_size = segments * sizeof(unsigned long); page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(stab_size)); BUG_ON(!page); iommu->stab = page_address(page); - clear_page(iommu->stab); + memset(iommu->stab, 0, stab_size); +} + +static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, + unsigned long base, unsigned long size, unsigned long gap_base, + unsigned long gap_size) +{ + struct page *page; + int i; + unsigned long reg, segments, pages_per_segment, ptab_size, + n_pte_pages, start_seg, *ptab; + + start_seg = base >> IO_SEGMENT_SHIFT; + segments = size >> IO_SEGMENT_SHIFT; + pages_per_segment = 1ull << IO_PAGENO_BITS; - /* ... and the page tables. Since these are contiguous, we can treat - * the page tables as one array of ptes, like pSeries does. - */ ptab_size = segments * pages_per_segment * sizeof(unsigned long); pr_debug("%s: iommu[%d]: ptab_size: %lu, order: %d\n", __FUNCTION__, iommu->nid, ptab_size, get_order(ptab_size)); page = alloc_pages_node(iommu->nid, GFP_KERNEL, get_order(ptab_size)); BUG_ON(!page); - iommu->ptab = page_address(page); - memset(iommu->ptab, 0, ptab_size); + ptab = page_address(page); + memset(ptab, 0, ptab_size); /* number of pages needed for a page table */ n_pte_pages = (pages_per_segment * sizeof(unsigned long)) >> IOMMU_PAGE_SHIFT; pr_debug("%s: iommu[%d]: stab at %p, ptab at %p, n_pte_pages: %lu\n", - __FUNCTION__, iommu->nid, iommu->stab, iommu->ptab, + __FUNCTION__, iommu->nid, iommu->stab, ptab, n_pte_pages); /* initialise the STEs */ @@ -364,12 +368,21 @@ static void cell_iommu_setup_page_tables(struct cbe_iommu *iommu, __unknown_page_size_error(); } + gap_base = gap_base >> IO_SEGMENT_SHIFT; + gap_size = gap_size >> IO_SEGMENT_SHIFT; + pr_debug("Setting up IOMMU stab:\n"); - for (i = base >> IO_SEGMENT_SHIFT; i < segments; i++) { - iommu->stab[i] = reg | - (__pa(iommu->ptab) + n_pte_pages * IOMMU_PAGE_SIZE * i); + for (i = start_seg; i < (start_seg + segments); i++) { + if (i >= gap_base && i < (gap_base + gap_size)) { + pr_debug("\toverlap at %d, skipping\n", i); + continue; + } + iommu->stab[i] = reg | (__pa(ptab) + n_pte_pages * + IOMMU_PAGE_SIZE * (i - start_seg)); pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]); } + + return ptab; } static void cell_iommu_enable_hardware(struct cbe_iommu *iommu) @@ -416,7 +429,8 @@ static void cell_iommu_enable_hardware(struct cbe_iommu *iommu) static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long base, unsigned long size) { - cell_iommu_setup_page_tables(iommu, base, size, 0, 0); + cell_iommu_setup_stab(iommu, base, size, 0, 0); + iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0); cell_iommu_enable_hardware(iommu); } @@ -870,8 +884,10 @@ static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, struct device_node *np, unsigned long dbase, unsigned long dsize, unsigned long fbase, unsigned long fsize) { - unsigned long base_pte, uaddr, *io_pte; int i; + unsigned long base_pte, uaddr, *io_pte, *ptab; + + ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize); dma_iommu_fixed_base = fbase; @@ -883,7 +899,7 @@ static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, pr_debug("iommu: mapping 0x%lx pages from 0x%lx\n", fsize, fbase); - io_pte = iommu->ptab; + io_pte = ptab; base_pte = IOPTE_PP_W | IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW | (cell_iommu_get_ioid(np) & IOPTE_IOID_Mask); @@ -894,7 +910,7 @@ static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, pr_debug("iommu: fixed/dynamic overlap, skipping\n"); continue; } - io_pte[i] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask); + io_pte[i - fbase] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask); } mb(); @@ -992,7 +1008,8 @@ static int __init cell_iommu_fixed_mapping_init(void) "fixed window 0x%lx-0x%lx\n", iommu->nid, dbase, dbase + dsize, fbase, fbase + fsize); - cell_iommu_setup_page_tables(iommu, dbase, dsize, fbase, fsize); + cell_iommu_setup_stab(iommu, dbase, dsize, fbase, fsize); + iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0); cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize, fbase, fsize); cell_iommu_enable_hardware(iommu); From 3d3e6da17d6af42a3fd4891fb09d93dca002e590 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:26 +1100 Subject: [PATCH 24/51] [POWERPC] Cell IOMMU: n_pte_pages is in 4K page units, not IOMMU_PAGE_SIZE We use n_pte_pages to calculate the stride through the page tables, but we also use it to set the NPPT value in the segment table entry. That is defined as the number of 4K pages per segment, so we should calculate it as such regardless of the IOMMU page size. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 187a723eafcd..7a861cb960d2 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -348,9 +348,8 @@ static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, ptab = page_address(page); memset(ptab, 0, ptab_size); - /* number of pages needed for a page table */ - n_pte_pages = (pages_per_segment * - sizeof(unsigned long)) >> IOMMU_PAGE_SHIFT; + /* number of 4K pages needed for a page table */ + n_pte_pages = (pages_per_segment * sizeof(unsigned long)) >> 12; pr_debug("%s: iommu[%d]: stab at %p, ptab at %p, n_pte_pages: %lu\n", __FUNCTION__, iommu->nid, iommu->stab, ptab, @@ -377,8 +376,8 @@ static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, pr_debug("\toverlap at %d, skipping\n", i); continue; } - iommu->stab[i] = reg | (__pa(ptab) + n_pte_pages * - IOMMU_PAGE_SIZE * (i - start_seg)); + iommu->stab[i] = reg | (__pa(ptab) + (n_pte_pages << 12) * + (i - start_seg)); pr_debug("\t[%d] 0x%016lx\n", i, iommu->stab[i]); } From 225d49050f9b6506f2f9df6b40e591ee93939d11 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:27 +1100 Subject: [PATCH 25/51] [POWERPC] Allow for different IOMMU page sizes in cell IOMMU code Make some preliminary changes to cell_iommu_alloc_ptab() to allow it to take the page size as a parameter rather than assuming IOMMU_PAGE_SIZE. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 7a861cb960d2..b0e347e4933a 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -113,7 +113,7 @@ /* IOMMU sizing */ #define IO_SEGMENT_SHIFT 28 -#define IO_PAGENO_BITS (IO_SEGMENT_SHIFT - IOMMU_PAGE_SHIFT) +#define IO_PAGENO_BITS(shift) (IO_SEGMENT_SHIFT - (shift)) /* The high bit needs to be set on every DMA address */ #define SPIDER_DMA_OFFSET 0x80000000ul @@ -328,7 +328,7 @@ static void cell_iommu_setup_stab(struct cbe_iommu *iommu, static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, unsigned long base, unsigned long size, unsigned long gap_base, - unsigned long gap_size) + unsigned long gap_size, unsigned long page_shift) { struct page *page; int i; @@ -337,7 +337,10 @@ static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, start_seg = base >> IO_SEGMENT_SHIFT; segments = size >> IO_SEGMENT_SHIFT; - pages_per_segment = 1ull << IO_PAGENO_BITS; + pages_per_segment = 1ull << IO_PAGENO_BITS(page_shift); + /* PTEs for each segment must start on a 4K bounday */ + pages_per_segment = max(pages_per_segment, + (1 << 12) / sizeof(unsigned long)); ptab_size = segments * pages_per_segment * sizeof(unsigned long); pr_debug("%s: iommu[%d]: ptab_size: %lu, order: %d\n", __FUNCTION__, @@ -358,13 +361,12 @@ static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, /* initialise the STEs */ reg = IOSTE_V | ((n_pte_pages - 1) << 5); - if (IOMMU_PAGE_SIZE == 0x1000) - reg |= IOSTE_PS_4K; - else if (IOMMU_PAGE_SIZE == 0x10000) - reg |= IOSTE_PS_64K; - else { - extern void __unknown_page_size_error(void); - __unknown_page_size_error(); + switch (page_shift) { + case 12: reg |= IOSTE_PS_4K; break; + case 16: reg |= IOSTE_PS_64K; break; + case 20: reg |= IOSTE_PS_1M; break; + case 24: reg |= IOSTE_PS_16M; break; + default: BUG(); } gap_base = gap_base >> IO_SEGMENT_SHIFT; @@ -429,7 +431,8 @@ static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long base, unsigned long size) { cell_iommu_setup_stab(iommu, base, size, 0, 0); - iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0); + iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0, + IOMMU_PAGE_SHIFT); cell_iommu_enable_hardware(iommu); } @@ -886,7 +889,8 @@ static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, int i; unsigned long base_pte, uaddr, *io_pte, *ptab; - ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize); + ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize, + IOMMU_PAGE_SHIFT); dma_iommu_fixed_base = fbase; @@ -1008,7 +1012,8 @@ static int __init cell_iommu_fixed_mapping_init(void) dbase + dsize, fbase, fbase + fsize); cell_iommu_setup_stab(iommu, dbase, dsize, fbase, fsize); - iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0); + iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0, + IOMMU_PAGE_SHIFT); cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize, fbase, fsize); cell_iommu_enable_hardware(iommu); From da40451bba23b51eaca4170a095891646ce72104 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Feb 2008 18:33:29 +1100 Subject: [PATCH 26/51] [POWERPC] Convert the cell IOMMU fixed mapping to 16M IOMMU pages The only tricky part is we need to adjust the PTE insertion loop to cater for holes in the page table. The PTEs for each segment start on a 4K boundary, so with 16M pages we have 16 PTEs per segment and then a gap to the next 4K page boundary. It might be possible to allocate the PTEs for each segment separately, saving the memory currently filling the gaps. However we'd need to check that's OK with the hardware, and that it actually saves memory. Signed-off-by: Michael Ellerman Signed-off-by: Arnd Bergmann --- arch/powerpc/platforms/cell/iommu.c | 37 +++++++++++++++++------------ 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index b0e347e4933a..20ea0e118f24 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -882,38 +882,45 @@ static void cell_dma_dev_setup_fixed(struct device *dev) dev_dbg(dev, "iommu: fixed addr = %lx\n", addr); } +static void insert_16M_pte(unsigned long addr, unsigned long *ptab, + unsigned long base_pte) +{ + unsigned long segment, offset; + + segment = addr >> IO_SEGMENT_SHIFT; + offset = (addr >> 24) - (segment << IO_PAGENO_BITS(24)); + ptab = ptab + (segment * (1 << 12) / sizeof(unsigned long)); + + pr_debug("iommu: addr %lx ptab %p segment %lx offset %lx\n", + addr, ptab, segment, offset); + + ptab[offset] = base_pte | (__pa(addr) & IOPTE_RPN_Mask); +} + static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, struct device_node *np, unsigned long dbase, unsigned long dsize, unsigned long fbase, unsigned long fsize) { - int i; - unsigned long base_pte, uaddr, *io_pte, *ptab; + unsigned long base_pte, uaddr, ioaddr, *ptab; - ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize, - IOMMU_PAGE_SHIFT); + ptab = cell_iommu_alloc_ptab(iommu, fbase, fsize, dbase, dsize, 24); dma_iommu_fixed_base = fbase; - /* convert from bytes into page table indices */ - dbase = dbase >> IOMMU_PAGE_SHIFT; - dsize = dsize >> IOMMU_PAGE_SHIFT; - fbase = fbase >> IOMMU_PAGE_SHIFT; - fsize = fsize >> IOMMU_PAGE_SHIFT; - pr_debug("iommu: mapping 0x%lx pages from 0x%lx\n", fsize, fbase); - io_pte = ptab; base_pte = IOPTE_PP_W | IOPTE_PP_R | IOPTE_M | IOPTE_SO_RW | (cell_iommu_get_ioid(np) & IOPTE_IOID_Mask); - uaddr = 0; - for (i = fbase; i < fbase + fsize; i++, uaddr += IOMMU_PAGE_SIZE) { + for (uaddr = 0; uaddr < fsize; uaddr += (1 << 24)) { /* Don't touch the dynamic region */ - if (i >= dbase && i < (dbase + dsize)) { + ioaddr = uaddr + fbase; + if (ioaddr >= dbase && ioaddr < (dbase + dsize)) { pr_debug("iommu: fixed/dynamic overlap, skipping\n"); continue; } - io_pte[i - fbase] = base_pte | (__pa(uaddr) & IOPTE_RPN_Mask); + + insert_16M_pte(uaddr, ptab, base_pte); } mb(); From 9b5cf48b06a52c04b85c88642c3b620db8e1d592 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 3 Mar 2008 01:17:37 +0100 Subject: [PATCH 27/51] x86: revert "x86: CPA: avoid split of alias mappings" Revert: commit 8be8f54bae3453588011cad06363813a5293af53 Author: Thomas Gleixner Date: Sat Feb 23 20:43:21 2008 +0100 x86: CPA: avoid split of alias mappings because it clearly mishandles the case when __change_page_attr(), called from __change_page_attr_set_clr(), changes cpa->processed to 1 and cpa_process_alias(cpa) is executed right after that. This crashes my x86-64 test box early in the boot process (ref. http://bugzilla.kernel.org/show_bug.cgi?id=10140#c4). Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7049294fb469..14e48b5a94ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,7 +26,6 @@ struct cpa_data { pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int processed; int flushtlb; unsigned long pfn; }; @@ -291,8 +290,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ nextpage_addr = (address + psize) & pmask; numpages = (nextpage_addr - address) >> PAGE_SHIFT; - if (numpages < cpa->processed) - cpa->processed = numpages; + if (numpages < cpa->numpages) + cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: @@ -319,7 +318,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ addr = address + PAGE_SIZE; pfn++; - for (i = 1; i < cpa->processed; i++, addr += PAGE_SIZE, pfn++) { + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(new_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) @@ -343,7 +342,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->processed == numpages) { + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { /* * The address is aligned and the number of pages * covers the full page. @@ -573,7 +572,7 @@ repeat: set_pte_atomic(kpte, new_pte); cpa->flushtlb = 1; } - cpa->processed = 1; + cpa->numpages = 1; return 0; } @@ -584,7 +583,7 @@ repeat: do_split = try_preserve_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, - * return. cp->processed and cpa->tlbflush have been updated in + * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) @@ -663,7 +662,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = cpa->processed = numpages; + cpa->numpages = numpages; ret = __change_page_attr(cpa, checkalias); if (ret) @@ -680,9 +679,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->processed > numpages); - numpages -= cpa->processed; - cpa->vaddr += cpa->processed * PAGE_SIZE; + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; } return 0; } From 902955fc13259dcec1321d45251a477977fcba39 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Mar 2008 13:53:58 +0100 Subject: [PATCH 28/51] x86: revert "x86: fix pmd_bad and pud_bad to support huge pages" revert commit cded932b75ab0a5f9181ee3da34a0a488d1a14fd, "x86: fix pmd_bad and pud_bad to support huge pages", it causes a bootup hang, as reported and bisected by Arjan van de Ven. Bisected-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/asm-x86/pgtable_32.h | 4 +--- include/asm-x86/pgtable_64.h | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index b478efa971e0..a842c7222b1e 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -91,9 +91,7 @@ extern unsigned long pg0[]; /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ #define pmd_none(x) (!(unsigned long)pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_bad(x) ((pmd_val(x) \ - & ~(PAGE_MASK | _PAGE_USER | _PAGE_PSE | _PAGE_NX)) \ - != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 0a9258333cbd..0a0b77bc736a 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -153,14 +153,12 @@ static inline unsigned long pgd_bad(pgd_t pgd) static inline unsigned long pud_bad(pud_t pud) { - return pud_val(pud) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } static inline unsigned long pmd_bad(pmd_t pmd) { - return pmd_val(pmd) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } #define pte_none(x) (!pte_val(x)) From a345b4ba2086bacc63884e5d72268415a97bcbff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Mar 2008 10:02:44 -0800 Subject: [PATCH 29/51] Revert "x86: fix pmd_bad and pud_bad to support huge pages" This reverts commit cded932b75ab0a5f9181ee3da34a0a488d1a14fd. Arjan bisected down a boot-time hang to this, saying: ".. it prevents the kernel to finish booting on my (Penryn based) laptop. The boot stops right after freeing the init memory." and while it's not clear exactly what triggers it, at this stage we're better off just reverting it while Ingo tries to figure out what went wrong. Requested-by: Arjan van de Ven Cc: Hans Rosenfeld Cc: Nish Aravamudan Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- include/asm-x86/pgtable_32.h | 4 +--- include/asm-x86/pgtable_64.h | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index b478efa971e0..a842c7222b1e 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -91,9 +91,7 @@ extern unsigned long pg0[]; /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ #define pmd_none(x) (!(unsigned long)pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_bad(x) ((pmd_val(x) \ - & ~(PAGE_MASK | _PAGE_USER | _PAGE_PSE | _PAGE_NX)) \ - != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 0a9258333cbd..0a0b77bc736a 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -153,14 +153,12 @@ static inline unsigned long pgd_bad(pgd_t pgd) static inline unsigned long pud_bad(pud_t pud) { - return pud_val(pud) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } static inline unsigned long pmd_bad(pmd_t pmd) { - return pmd_val(pmd) & - ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER | _PAGE_PSE | _PAGE_NX); + return pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } #define pte_none(x) (!pte_val(x)) From a64e715fc74b1a7dcc5944f848acc38b2c4d4ee2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Mar 2008 10:12:14 -0800 Subject: [PATCH 30/51] Allow ARG_MAX execve string space even with a small stack limit The new code that removed the limitation on the execve string size (which was historically 32 pages) replaced it with a much softer limit based on RLIMIT_STACK which is usually much larger than the traditional limit. See commit b6a2fea39318e43fee84fa7b0b90d68bed92d2ba ("mm: variable length argument support") for details. However, if you have a small stack limit (perhaps because you need lots of stacks in a threaded environment), the new heuristic of allowing up to 1/4th of RLIMIT_STACK to be used for argument and environment strings could actually be smaller than the old limit. So just say that it's ok to have up to ARG_MAX strings regardless of the value of RLIMIT_STACK, and check the rlimit only when going over that traditional limit. (Of course, if you actually have a *really* small stack limit, the whole stack itself will be limited before you hit ARG_MAX, but that has always been true and is clearly the right behaviour anyway). Acked-by: Carlos O'Donell Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Ollie Wild Signed-off-by: Linus Torvalds --- fs/exec.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index a44b142fb460..54a0a557b678 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -173,8 +173,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; if (write) { - struct rlimit *rlim = current->signal->rlim; unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + struct rlimit *rlim; + + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + if (size <= ARG_MAX) + return page; /* * Limit to 1/4-th the stack size for the argv+env strings. @@ -183,6 +190,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * - the program will have a reasonable amount of stack left * to work from. */ + rlim = current->signal->rlim; if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { put_page(page); return NULL; From 78a4a50a86b0a54f7ecbc164267b6c762760254c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:02:31 -0800 Subject: [PATCH 31/51] docbook: fix filesystems.tmpl source files Fix docbook problems in filesystems.tmpl. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 +-- fs/jbd/transaction.c | 17 +++++++++-------- fs/mpage.c | 11 +++-------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 3ebccf4aa7e3..897cd7477b34 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -627,8 +627,7 @@ repeat: } /** - * sync_mapping_buffers - write out and wait upon a mapping's "associated" - * buffers + * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * * Starts I/O against the buffers at mapping->private_list, and waits upon diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 038ed7436199..c6cbb6cd59b2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -369,7 +369,7 @@ out: /** - * int journal_restart() - restart a handle . + * int journal_restart() - restart a handle. * @handle: handle to restart * @nblocks: nr credits requested * @@ -844,8 +844,7 @@ out: } /** - * int journal_get_undo_access() - Notify intent to modify metadata with - * non-rewindable consequences + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences * @handle: transaction * @bh: buffer to undo * @credits: store the number of taken credits here (if not NULL) @@ -921,12 +920,14 @@ out: } /** - * int journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed * @handle: transaction * @bh: bufferhead to mark * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * * The buffer is placed on the transaction's data list and is marked as * belonging to the transaction. * @@ -1098,11 +1099,11 @@ no_journal: } /** - * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * - * mark dirty metadata which needs to be journaled as part of the current + * Mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer is placed on the transaction's metadata list and is marked diff --git a/fs/mpage.c b/fs/mpage.c index 5df564366f36..235e4d3873a8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -325,16 +325,12 @@ confused: } /** - * mpage_readpages - populate an address space with some pages, and - * start reads against them. - * + * mpage_readpages - populate an address space with some pages & start reads against them * @mapping: the address_space * @pages: The address of a list_head which contains the target pages. These * pages have their ->index populated and are otherwise uninitialised. - * * The page at @pages->prev has the lowest file offset, and reads should be * issued in @pages->prev to @pages->next order. - * * @nr_pages: The number of pages at *@pages * @get_block: The filesystem's block mapper function. * @@ -360,6 +356,7 @@ confused: * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 + * * because the indirect block has to be read to get the mappings of blocks * 13,14,15,16. Obviously, this impacts performance. * @@ -656,9 +653,7 @@ out: } /** - * mpage_writepages - walk the list of dirty pages of the given - * address space and writepage() all of them. - * + * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. From 7b089c8b90e6caedda889334bba5c72a152b79c6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:02:40 -0800 Subject: [PATCH 32/51] docbook: fix rapidio source files Fix docbook problems in rapidio source files. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-driver.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 5480119ff9d3..3ce9f3defc12 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -78,8 +78,7 @@ void rio_dev_put(struct rio_dev *rdev) } /** - * rio_device_probe - Tell if a RIO device structure has a matching RIO - * device id structure + * rio_device_probe - Tell if a RIO device structure has a matching RIO device id structure * @id: the RIO device id structure to match against * @dev: the RIO device structure to match against * @@ -137,7 +136,7 @@ static int rio_device_remove(struct device *dev) * rio_register_driver - register a new RIO driver * @rdrv: the RIO driver structure to register * - * Adds a &struct rio_driver to the list of registered drivers + * Adds a &struct rio_driver to the list of registered drivers. * Returns a negative value on error, otherwise 0. If no error * occurred, the driver remains registered even if no device * was claimed during registration. @@ -167,8 +166,7 @@ void rio_unregister_driver(struct rio_driver *rdrv) } /** - * rio_match_bus - Tell if a RIO device structure has a matching RIO - * driver device id structure + * rio_match_bus - Tell if a RIO device structure has a matching RIO driver device id structure * @dev: the standard device structure to match against * @drv: the standard driver structure containing the ids to match against * From e59e4a09729b06a131de9042b2a5b05b7ad26174 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:02:50 -0800 Subject: [PATCH 33/51] docbook: fix scsi source file Fix docbook problem in SCSI source files. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/scsi/scsi_scan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 1dc165ad17fb..e67c14e31bab 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1577,8 +1577,7 @@ static void __scsi_scan_target(struct device *parent, unsigned int channel, } /** - * scsi_scan_target - scan a target id, possibly including all LUNs on the - * target. + * scsi_scan_target - scan a target id, possibly including all LUNs on the target. * @parent: host to scan * @channel: channel to scan * @id: target id to scan From d0bcabcd72dda5f553322a1ca92ae31c15b408b6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:03:07 -0800 Subject: [PATCH 34/51] docbook: fix usb source files Fix docbook problems in USB source files. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/usb/core/usb.c | 6 ++---- include/linux/usb.h | 9 +++------ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 4e984060c984..f6f19908f5f0 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -99,8 +99,7 @@ struct usb_interface *usb_ifnum_to_if(const struct usb_device *dev, EXPORT_SYMBOL_GPL(usb_ifnum_to_if); /** - * usb_altnum_to_altsetting - get the altsetting structure with a given - * alternate setting number. + * usb_altnum_to_altsetting - get the altsetting structure with a given alternate setting number. * @intf: the interface containing the altsetting in question * @altnum: the desired alternate setting number * @@ -442,8 +441,7 @@ EXPORT_SYMBOL_GPL(usb_put_intf); */ /** - * usb_lock_device_for_reset - cautiously acquire the lock for a - * usb device structure + * usb_lock_device_for_reset - cautiously acquire the lock for a usb device structure * @udev: device that's being locked * @iface: interface bound to the driver making the request (optional) * diff --git a/include/linux/usb.h b/include/linux/usb.h index 2372e2e6b527..5bd3ae8aaaf4 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -781,8 +781,7 @@ static inline int usb_endpoint_is_isoc_out( .idVendor = (vend), \ .idProduct = (prod) /** - * USB_DEVICE_VER - macro used to describe a specific usb device with a - * version range + * USB_DEVICE_VER - describe a specific usb device with a version range * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @lo: the bcdDevice_lo value @@ -799,8 +798,7 @@ static inline int usb_endpoint_is_isoc_out( .bcdDevice_hi = (hi) /** - * USB_DEVICE_INTERFACE_PROTOCOL - macro used to describe a usb - * device with a specific interface protocol + * USB_DEVICE_INTERFACE_PROTOCOL - describe a usb device with a specific interface protocol * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @pr: bInterfaceProtocol value @@ -846,8 +844,7 @@ static inline int usb_endpoint_is_isoc_out( .bInterfaceProtocol = (pr) /** - * USB_DEVICE_AND_INTERFACE_INFO - macro used to describe a specific usb device - * with a class of usb interfaces + * USB_DEVICE_AND_INTERFACE_INFO - describe a specific usb device with a class of usb interfaces * @vend: the 16 bit USB Vendor ID * @prod: the 16 bit USB Product ID * @cl: bInterfaceClass value From 0643245f595dc175c14245fa1e1e9efda3e12f2a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:03:15 -0800 Subject: [PATCH 35/51] docbook: fix kernel-api source files Fix docbook problems in kernel-api.tmpl. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/base/transport_class.c | 4 +--- drivers/pci/rom.c | 3 +-- mm/truncate.c | 3 +-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/base/transport_class.c b/drivers/base/transport_class.c index f25e7c6b2d27..40bca48abc12 100644 --- a/drivers/base/transport_class.c +++ b/drivers/base/transport_class.c @@ -126,9 +126,7 @@ static int transport_setup_classdev(struct attribute_container *cont, } /** - * transport_setup_device - declare a new dev for transport class association - * but don't make it visible yet. - * + * transport_setup_device - declare a new dev for transport class association but don't make it visible yet. * @dev: the generic device representing the entity being added * * Usually, dev represents some component in the HBA system (either diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index a98b2470b9ea..bd5c0e031398 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -242,8 +242,7 @@ void pci_remove_rom(struct pci_dev *pdev) #endif /* 0 */ /** - * pci_cleanup_rom - internal routine for freeing the ROM copy created - * by pci_map_rom_copy called from remove.c + * pci_cleanup_rom - free the ROM copy created by pci_map_rom_copy * @pdev: pointer to pci device struct * * Free the copied ROM if we allocated one. diff --git a/mm/truncate.c b/mm/truncate.c index c35c49e54fb6..7d20ce41ecf5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -134,8 +134,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) } /** - * truncate_inode_pages - truncate range of pages specified by start and - * end byte offsets + * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate From 7105a387a8ac9b512b900efd5ff7a97acc44fb39 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Feb 2008 22:03:27 -0800 Subject: [PATCH 36/51] docbook: fix fusion source files Fix docbook problems in fusion source files. These cause the generated docbook to be incorrect. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/message/fusion/mptbase.c | 25 +++++++++++++------------ drivers/message/fusion/mptscsih.c | 14 +++++--------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 0c303c84b37b..6b6df8679585 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -632,8 +632,7 @@ mpt_deregister(u8 cb_idx) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /** - * mpt_event_register - Register protocol-specific event callback - * handler. + * mpt_event_register - Register protocol-specific event callback handler. * @cb_idx: previously registered (via mpt_register) callback handle * @ev_cbfunc: callback function * @@ -654,8 +653,7 @@ mpt_event_register(u8 cb_idx, MPT_EVHANDLER ev_cbfunc) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /** - * mpt_event_deregister - Deregister protocol-specific event callback - * handler. + * mpt_event_deregister - Deregister protocol-specific event callback handler * @cb_idx: previously registered callback handle * * Each protocol-specific driver should call this routine @@ -765,11 +763,13 @@ mpt_device_driver_deregister(u8 cb_idx) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /** - * mpt_get_msg_frame - Obtain a MPT request frame from the pool (of 1024) - * allocated per MPT adapter. + * mpt_get_msg_frame - Obtain an MPT request frame from the pool * @cb_idx: Handle of registered MPT protocol driver * @ioc: Pointer to MPT adapter structure * + * Obtain an MPT request frame from the pool (of 1024) that are + * allocated per MPT adapter. + * * Returns pointer to a MPT request frame or %NULL if none are available * or IOC is not active. */ @@ -834,13 +834,12 @@ mpt_get_msg_frame(u8 cb_idx, MPT_ADAPTER *ioc) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /** - * mpt_put_msg_frame - Send a protocol specific MPT request frame - * to a IOC. + * mpt_put_msg_frame - Send a protocol-specific MPT request frame to an IOC * @cb_idx: Handle of registered MPT protocol driver * @ioc: Pointer to MPT adapter structure * @mf: Pointer to MPT request frame * - * This routine posts a MPT request frame to the request post FIFO of a + * This routine posts an MPT request frame to the request post FIFO of a * specific MPT adapter. */ void @@ -868,13 +867,15 @@ mpt_put_msg_frame(u8 cb_idx, MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf) } /** - * mpt_put_msg_frame_hi_pri - Send a protocol specific MPT request frame - * to a IOC using hi priority request queue. + * mpt_put_msg_frame_hi_pri - Send a hi-pri protocol-specific MPT request frame * @cb_idx: Handle of registered MPT protocol driver * @ioc: Pointer to MPT adapter structure * @mf: Pointer to MPT request frame * - * This routine posts a MPT request frame to the request post FIFO of a + * Send a protocol-specific MPT request frame to an IOC using + * hi-priority request queue. + * + * This routine posts an MPT request frame to the request post FIFO of a * specific MPT adapter. **/ void diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index af1de0ccee2f..0c252f60c4c1 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -1533,7 +1533,7 @@ mptscsih_freeChainBuffers(MPT_ADAPTER *ioc, int req_idx) * * Remark: Currently invoked from a non-interrupt thread (_bh). * - * Remark: With old EH code, at most 1 SCSI TaskMgmt function per IOC + * Note: With old EH code, at most 1 SCSI TaskMgmt function per IOC * will be active. * * Returns 0 for SUCCESS, or %FAILED. @@ -2537,14 +2537,12 @@ mptscsih_copy_sense_data(struct scsi_cmnd *sc, MPT_SCSI_HOST *hd, MPT_FRAME_HDR /** * mptscsih_get_scsi_lookup - * - * retrieves scmd entry from ScsiLookup[] array list - * * @ioc: Pointer to MPT_ADAPTER structure * @i: index into the array * - * Returns the scsi_cmd pointer + * retrieves scmd entry from ScsiLookup[] array list * + * Returns the scsi_cmd pointer **/ static struct scsi_cmnd * mptscsih_get_scsi_lookup(MPT_ADAPTER *ioc, int i) @@ -2561,14 +2559,12 @@ mptscsih_get_scsi_lookup(MPT_ADAPTER *ioc, int i) /** * mptscsih_getclear_scsi_lookup - * - * retrieves and clears scmd entry from ScsiLookup[] array list - * * @ioc: Pointer to MPT_ADAPTER structure * @i: index into the array * - * Returns the scsi_cmd pointer + * retrieves and clears scmd entry from ScsiLookup[] array list * + * Returns the scsi_cmd pointer **/ static struct scsi_cmnd * mptscsih_getclear_scsi_lookup(MPT_ADAPTER *ioc, int i) From a973e9dd1e140a65bed694a2c5c8d53e9cba1a23 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 1 Mar 2008 13:40:44 -0800 Subject: [PATCH 37/51] Revert "unique end pointer" patch This only made sense for the alternate fastpath which was reverted last week. Mathieu is working on a new version that addresses the fastpath issues but that new code first needs to go through mm and it is not clear if we need the unique end pointers with his new scheme. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- include/linux/mm_types.h | 5 +-- mm/slub.c | 70 +++++++++++++--------------------------- 2 files changed, 24 insertions(+), 51 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bfee0bd1d435..34023c65d466 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -64,10 +64,7 @@ struct page { #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS spinlock_t ptl; #endif - struct { - struct kmem_cache *slab; /* SLUB: Pointer to slab */ - void *end; /* SLUB: end marker */ - }; + struct kmem_cache *slab; /* SLUB: Pointer to slab */ struct page *first_page; /* Compound tail pages */ }; union { diff --git a/mm/slub.c b/mm/slub.c index 74c65af0a54f..a873953e5a11 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -291,32 +291,15 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) #endif } -/* - * The end pointer in a slab is special. It points to the first object in the - * slab but has bit 0 set to mark it. - * - * Note that SLUB relies on page_mapping returning NULL for pages with bit 0 - * in the mapping set. - */ -static inline int is_end(void *addr) -{ - return (unsigned long)addr & PAGE_MAPPING_ANON; -} - -static void *slab_address(struct page *page) -{ - return page->end - PAGE_MAPPING_ANON; -} - static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) { void *base; - if (object == page->end) + if (!object) return 1; - base = slab_address(page); + base = page_address(page); if (object < base || object >= base + s->objects * s->size || (object - base) % s->size) { return 0; @@ -349,8 +332,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Scan freelist */ #define for_each_free_object(__p, __s, __free) \ - for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\ - __p)) + for (__p = (__free); __p; __p = get_freepointer((__s), __p)) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -502,7 +484,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ - u8 *addr = slab_address(page); + u8 *addr = page_address(page); print_tracking(s, p); @@ -680,7 +662,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!(s->flags & SLAB_POISON)) return 1; - start = slab_address(page); + start = page_address(page); end = start + (PAGE_SIZE << s->order); length = s->objects * s->size; remainder = end - (start + length); @@ -748,7 +730,7 @@ static int check_object(struct kmem_cache *s, struct page *page, * of the free objects in this slab. May cause * another error because the object count is now wrong. */ - set_freepointer(s, p, page->end); + set_freepointer(s, p, NULL); return 0; } return 1; @@ -782,18 +764,18 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) void *fp = page->freelist; void *object = NULL; - while (fp != page->end && nr <= s->objects) { + while (fp && nr <= s->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { if (object) { object_err(s, page, object, "Freechain corrupt"); - set_freepointer(s, object, page->end); + set_freepointer(s, object, NULL); break; } else { slab_err(s, page, "Freepointer corrupt"); - page->freelist = page->end; + page->freelist = NULL; page->inuse = s->objects; slab_fix(s, "Freelist cleared"); return 0; @@ -899,7 +881,7 @@ bad: */ slab_fix(s, "Marking all objects used"); page->inuse = s->objects; - page->freelist = page->end; + page->freelist = NULL; } return 0; } @@ -939,7 +921,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && page->freelist == page->end) + if (!SlabFrozen(page) && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -1124,7 +1106,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) SetSlabDebug(page); start = page_address(page); - page->end = start + 1; if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << s->order); @@ -1136,7 +1117,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) last = p; } setup_object(s, page, last); - set_freepointer(s, last, page->end); + set_freepointer(s, last, NULL); page->freelist = start; page->inuse = 0; @@ -1152,7 +1133,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) void *p; slab_pad_check(s, page); - for_each_object(p, s, slab_address(page)) + for_each_object(p, s, page_address(page)) check_object(s, page, p, 0); ClearSlabDebug(page); } @@ -1162,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); - page->mapping = NULL; __free_pages(page, s->order); } @@ -1366,7 +1346,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) ClearSlabFrozen(page); if (page->inuse) { - if (page->freelist != page->end) { + if (page->freelist) { add_partial(n, page, tail); stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { @@ -1410,12 +1390,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) * Merge cpu freelist into freelist. Typically we get here * because both freelists are empty. So this is unlikely * to occur. - * - * We need to use _is_end here because deactivate slab may - * be called for a debug slab. Then c->freelist may contain - * a dummy pointer. */ - while (unlikely(!is_end(c->freelist))) { + while (unlikely(c->freelist)) { void **object; tail = 0; /* Hot objects. Put the slab first */ @@ -1517,7 +1493,7 @@ static void *__slab_alloc(struct kmem_cache *s, stat(c, ALLOC_REFILL); load_freelist: object = c->page->freelist; - if (unlikely(object == c->page->end)) + if (unlikely(!object)) goto another_slab; if (unlikely(SlabDebug(c->page))) goto debug; @@ -1525,7 +1501,7 @@ load_freelist: object = c->page->freelist; c->freelist = object[c->offset]; c->page->inuse = s->objects; - c->page->freelist = c->page->end; + c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); @@ -1607,7 +1583,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); - if (unlikely(is_end(c->freelist) || !node_match(c, node))) + if (unlikely(!c->freelist || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); @@ -1677,7 +1653,7 @@ checks_ok: * was not on the partial list before * then add it. */ - if (unlikely(prior == page->end)) { + if (unlikely(!prior)) { add_partial(get_node(s, page_to_nid(page)), page, 1); stat(c, FREE_ADD_PARTIAL); } @@ -1687,7 +1663,7 @@ out_unlock: return; slab_empty: - if (prior != page->end) { + if (prior) { /* * Slab still on the partial list. */ @@ -1910,7 +1886,7 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, struct kmem_cache_cpu *c) { c->page = NULL; - c->freelist = (void *)PAGE_MAPPING_ANON; + c->freelist = NULL; c->node = 0; c->offset = s->offset / sizeof(void *); c->objsize = s->objsize; @@ -3199,7 +3175,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { void *p; - void *addr = slab_address(page); + void *addr = page_address(page); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) @@ -3482,7 +3458,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { - void *addr = slab_address(page); + void *addr = page_address(page); DECLARE_BITMAP(map, s->objects); void *p; From d9acf4b7b62d783d84273a61aed41a0f025b08ac Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 15:22:21 -0800 Subject: [PATCH 38/51] slub: rename slab_objects to show_slab_objects The sysfs callback is better named show_slab_objects since it is always called from the xxx_show callbacks. We need the name for other purposes later. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a873953e5a11..e01d399894c4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3567,7 +3567,7 @@ enum slab_stat_type { #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) -static unsigned long slab_objects(struct kmem_cache *s, +static unsigned long show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; @@ -3730,25 +3730,25 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); } SLAB_ATTR_RO(slabs); static ssize_t partial_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_PARTIAL); + return show_slab_objects(s, buf, SO_PARTIAL); } SLAB_ATTR_RO(partial); static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_CPU); + return show_slab_objects(s, buf, SO_CPU); } SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); } SLAB_ATTR_RO(objects); From e153362a50a34439718a938a851bba977116e19a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:24 -0800 Subject: [PATCH 39/51] slub: Remove objsize check in kmem_cache_flags() There is no page->offset anymore and also no associated limit on the number of objects. The page->offset field was removed for 2.6.24. So the check in kmem_cache_flags() is now also obsolete (should have been dropped earlier, somehow a hunk vanished). Reviewed-by: Pekka Enberg Signed-by: Christoph Lameter --- mm/slub.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e01d399894c4..d7d0d866b6b2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -997,30 +997,11 @@ static unsigned long kmem_cache_flags(unsigned long objsize, void (*ctor)(struct kmem_cache *, void *)) { /* - * The page->offset field is only 16 bit wide. This is an offset - * in units of words from the beginning of an object. If the slab - * size is bigger then we cannot move the free pointer behind the - * object anymore. - * - * On 32 bit platforms the limit is 256k. On 64bit platforms - * the limit is 512k. - * - * Debugging or ctor may create a need to move the free - * pointer. Fail if this happens. + * Enable debugging if selected on the kernel commandline. */ - if (objsize >= 65535 * sizeof(void *)) { - BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(ctor); - } else { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, - strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; - } + if (slub_debug && (!slub_debug_slabs || + strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) + flags |= slub_debug; return flags; } From d692ef6dcd20da60786470654410e85f29c2ddd9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:24 -0800 Subject: [PATCH 40/51] slub: Remove useless checks in alloc_debug_processing Alloc debug processing is never called with a NULL object pointer. No reason to check for NULL. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d7d0d866b6b2..0a5a1001590b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -852,7 +852,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, if (!check_slab(s, page)) goto bad; - if (object && !on_freelist(s, page, object)) { + if (!on_freelist(s, page, object)) { object_err(s, page, object, "Object already allocated"); goto bad; } @@ -862,7 +862,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (object && !check_object(s, page, object, 0)) + if (!check_object(s, page, object, 0)) goto bad; /* Success perform special debug activities for allocs */ From 27d9e4e94862c89d171cf70911b4f11ad69fb54e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:25 -0800 Subject: [PATCH 41/51] slub: Use the objsize from the kmem_cache_cpu structure No need to access the kmem_cache structure. We have the same value in kmem_cache_cpu. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 0a5a1001590b..b49570ca08b5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1681,8 +1681,8 @@ static __always_inline void slab_free(struct kmem_cache *s, unsigned long flags; local_irq_save(flags); - debug_check_no_locks_freed(object, s->objsize); c = get_cpu_slab(s, smp_processor_id()); + debug_check_no_locks_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; From ae20bfda6813387af18c7fdbc0f8b1fa7be2d05b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:25 -0800 Subject: [PATCH 42/51] slub: Remove BUG_ON() from ksize and omit checks for !SLUB_DEBUG The BUG_ONs are useless since the pointer derefs will lead to NULL deref errors anyways. Some of the checks are not necessary if no debugging is possible. Signed-off-by: Christoph Lameter --- mm/slub.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b49570ca08b5..09b5dc82df58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2610,19 +2610,17 @@ size_t ksize(const void *object) struct page *page; struct kmem_cache *s; - BUG_ON(!object); if (unlikely(object == ZERO_SIZE_PTR)) return 0; page = virt_to_head_page(object); - BUG_ON(!page); if (unlikely(!PageSlab(page))) return PAGE_SIZE << compound_order(page); s = page->slab; - BUG_ON(!s); +#ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. @@ -2630,6 +2628,7 @@ size_t ksize(const void *object) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->objsize; +#endif /* * If we have the need to store the freelist pointer * back there or track user information then we can @@ -2637,7 +2636,6 @@ size_t ksize(const void *object) */ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) return s->inuse; - /* * Else we can use all the padding etc for the allocation */ From d8b42bf54be18b5d0bad941b3a1d3e8f022651a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:25 -0800 Subject: [PATCH 43/51] slub: Rearrange #ifdef CONFIG_SLUB_DEBUG in calculate_sizes() Group SLUB_DEBUG code together to reduce the number of #ifdefs. Move some debug checks under the #ifdef. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 09b5dc82df58..72f5f4ecd1d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2129,6 +2129,14 @@ static int calculate_sizes(struct kmem_cache *s) unsigned long size = s->objsize; unsigned long align = s->align; + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG /* * Determine if we can poison the object itself. If the user of * the slab may touch the object after free or before allocation @@ -2140,14 +2148,7 @@ static int calculate_sizes(struct kmem_cache *s) else s->flags &= ~__OBJECT_POISON; - /* - * Round up object size to the next word boundary. We can only - * place the free pointer at word boundaries and this determines - * the possible location of the free pointer. - */ - size = ALIGN(size, sizeof(void *)); -#ifdef CONFIG_SLUB_DEBUG /* * If we are Redzoning then check if there is some space between the * end of the object and the free pointer. If not then add an From 6446faa2ff30ca77c5b25e886bbbfb81c63f1c91 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 15 Feb 2008 23:45:26 -0800 Subject: [PATCH 44/51] slub: Fix up comments Provide comments and fix up various spelling / style issues. Signed-off-by: Christoph Lameter --- include/linux/slub_def.h | 4 ++-- mm/slub.c | 49 +++++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 57deecc79d52..b00c1c73eb0a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -61,7 +61,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ - int order; + int order; /* Current preferred allocation order */ /* * Avoid an extra cache line for UP, SMP and for the node local to @@ -138,11 +138,11 @@ static __always_inline int kmalloc_index(size_t size) if (size <= 512) return 9; if (size <= 1024) return 10; if (size <= 2 * 1024) return 11; + if (size <= 4 * 1024) return 12; /* * The following is only needed to support architectures with a larger page * size than 4k. */ - if (size <= 4 * 1024) return 12; if (size <= 8 * 1024) return 13; if (size <= 16 * 1024) return 14; if (size <= 32 * 1024) return 15; diff --git a/mm/slub.c b/mm/slub.c index 72f5f4ecd1d2..10d546954efa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -291,6 +291,7 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) #endif } +/* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) { @@ -619,7 +620,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER * C. Padding to reach required alignment boundary or at mininum - * one word if debuggin is on to be able to detect writes + * one word if debugging is on to be able to detect writes * before the word boundary. * * Padding is done using 0x5a (POISON_INUSE) @@ -1268,7 +1269,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes * defrag_ratio = 1000) then every (well almost) allocation will * first attempt to defrag slab caches on other nodes. This means * scanning over all nodes to look for partial slabs which may be @@ -1343,9 +1344,11 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs * to come after the other slabs with objects in - * order to fill them up. That way the size of the - * partial list stays small. kmem_cache_shrink can - * reclaim empty slabs from the partial list. + * so that the others get filled first. That way the + * size of the partial list stays small. + * + * kmem_cache_shrink can reclaim any empty slabs from the + * partial list. */ add_partial(n, page, 1); slab_unlock(page); @@ -1368,7 +1371,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) if (c->freelist) stat(c, DEACTIVATE_REMOTE_FREES); /* - * Merge cpu freelist into freelist. Typically we get here + * Merge cpu freelist into slab freelist. Typically we get here * because both freelists are empty. So this is unlikely * to occur. */ @@ -1399,6 +1402,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* * Flush cpu slab. + * * Called from IPI handler with interrupts disabled. */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) @@ -1457,7 +1461,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * rest of the freelist to the lockless freelist. * * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is slowest path since we may sleep. + * we need to allocate a new slab. This is the slowest path since it involves + * a call to the page allocator and the setup of a new slab. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) @@ -1471,7 +1476,9 @@ static void *__slab_alloc(struct kmem_cache *s, slab_lock(c->page); if (unlikely(!node_match(c, node))) goto another_slab; + stat(c, ALLOC_REFILL); + load_freelist: object = c->page->freelist; if (unlikely(!object)) @@ -1616,6 +1623,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (unlikely(SlabDebug(page))) goto debug; + checks_ok: prior = object[offset] = page->freelist; page->freelist = object; @@ -1630,8 +1638,7 @@ checks_ok: goto slab_empty; /* - * Objects left in the slab. If it - * was not on the partial list before + * Objects left in the slab. If it was not on the partial list before * then add it. */ if (unlikely(!prior)) { @@ -1845,13 +1852,11 @@ static unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size) { /* - * If the user wants hardware cache aligned objects then - * follow that suggestion if the object is sufficiently - * large. + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. * - * The hardware cache alignment cannot override the - * specified alignment though. If that is greater - * then use it. + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. */ if ((flags & SLAB_HWCACHE_ALIGN) && size > cache_line_size() / 2) @@ -2049,6 +2054,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, #endif init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); + /* * lockdep requires consistent irq usage for each lock * so even though there cannot be a race this early in @@ -2301,7 +2307,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) /* * We could also check if the object is on the slabs freelist. * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid is to check if the object belongs + * purpose of kmem_ptr_valid() is to check if the object belongs * to a certain slab. */ return 1; @@ -2913,7 +2919,7 @@ void __init kmem_cache_init(void) /* * Patch up the size_index table if we have strange large alignment * requirements for the kmalloc array. This is only the case for - * mips it seems. The standard arches will not generate any code here. + * MIPS it seems. The standard arches will not generate any code here. * * Largest permitted alignment is 256 bytes due to the way we * handle the index determination for the smaller caches. @@ -2942,7 +2948,6 @@ void __init kmem_cache_init(void) kmem_size = sizeof(struct kmem_cache); #endif - printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -3039,12 +3044,15 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, */ for_each_online_cpu(cpu) get_cpu_slab(s, cpu)->objsize = s->objsize; + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); + if (sysfs_slab_alias(s, name)) goto err; return s; } + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, @@ -3927,7 +3935,6 @@ SLAB_ATTR(remote_node_defrag_ratio); #endif #ifdef CONFIG_SLUB_STATS - static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) { unsigned long sum = 0; @@ -4111,8 +4118,8 @@ static struct kset *slab_kset; #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: - * format - * :[flags-]size:[memory address of kmemcache] + * + * Format :[flags-]size */ static char *create_unique_id(struct kmem_cache *s) { From 7693143481730686362cc6360e3d47c012d9b2c8 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Sat, 1 Mar 2008 13:43:54 -0800 Subject: [PATCH 45/51] slub: look up object from the freelist once We only need to look up object from c->page->freelist once in __slab_alloc(). Signed-off-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 10d546954efa..db8026ba049f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1486,7 +1486,6 @@ load_freelist: if (unlikely(SlabDebug(c->page))) goto debug; - object = c->page->freelist; c->freelist = object[c->offset]; c->page->inuse = s->objects; c->page->freelist = NULL; @@ -1542,7 +1541,6 @@ new_slab: return NULL; debug: - object = c->page->freelist; if (!alloc_debug_processing(s, c->page, object, addr)) goto another_slab; From f619cfe1bda809a97c407f4c723eb3235ecd64e5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 1 Mar 2008 13:56:40 -0800 Subject: [PATCH 46/51] slub: Add kmalloc_large_node() to support kmalloc_node fallback Slub is missing some NUMA support for large kmallocs. Provide that. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index db8026ba049f..ecacacdce9d7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2592,13 +2592,24 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +static void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page = alloc_pages_node(node, flags | __GFP_COMP, + get_order(size)); + + if (page) + return page_address(page); + else + return NULL; +} + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; if (unlikely(size > PAGE_SIZE)) - return kmalloc_large(size, flags); + return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -3146,7 +3157,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; if (unlikely(size > PAGE_SIZE)) - return kmalloc_large(size, gfpflags); + return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); From 62e5c4b4d6351707346695fd9e151b6cda85cbe1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 2 Mar 2008 23:28:24 +0300 Subject: [PATCH 47/51] slub: fix possible NULL pointer dereference This patch fix possible NULL pointer dereference if kzalloc failed. To be able to return proper error code the function return type is changed to ssize_t (according to callees and sysfs definitions). Signed-off-by: Cyrill Gorcunov Signed-off-by: Christoph Lameter --- mm/slub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ecacacdce9d7..0863fd38a5ce 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3564,8 +3564,8 @@ enum slab_stat_type { #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) -static unsigned long show_slab_objects(struct kmem_cache *s, - char *buf, unsigned long flags) +static ssize_t show_slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) { unsigned long total = 0; int cpu; @@ -3575,6 +3575,8 @@ static unsigned long show_slab_objects(struct kmem_cache *s, unsigned long *per_cpu; nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + if (!nodes) + return -ENOMEM; per_cpu = nodes + nr_node_ids; for_each_possible_cpu(cpu) { From 5ce2087ed0eb424e0889bdc9102727f65d2ecdde Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Mon, 3 Mar 2008 01:23:49 +0000 Subject: [PATCH 48/51] Fix default compose table initialization Oddly enough, unsigned int c = '\300'; puts a "negative" value in c, not 0300... This fixes the default unicode compose table by using integers instead of character constants. Signed-off-by: Samuel Thibault Signed-off-by: Linus Torvalds --- drivers/acorn/char/defkeymap-l7200.c | 68 ++++++++++++++-------------- drivers/char/defkeymap.c_shipped | 68 ++++++++++++++-------------- drivers/s390/char/defkeymap.c | 4 +- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/drivers/acorn/char/defkeymap-l7200.c b/drivers/acorn/char/defkeymap-l7200.c index 28a5fbc6aa1a..93d80a1c36f9 100644 --- a/drivers/acorn/char/defkeymap-l7200.c +++ b/drivers/acorn/char/defkeymap-l7200.c @@ -347,40 +347,40 @@ char *func_table[MAX_NR_FUNC] = { }; struct kbdiacruc accent_table[MAX_DIACR] = { - {'`', 'A', '\300'}, {'`', 'a', '\340'}, - {'\'', 'A', '\301'}, {'\'', 'a', '\341'}, - {'^', 'A', '\302'}, {'^', 'a', '\342'}, - {'~', 'A', '\303'}, {'~', 'a', '\343'}, - {'"', 'A', '\304'}, {'"', 'a', '\344'}, - {'O', 'A', '\305'}, {'o', 'a', '\345'}, - {'0', 'A', '\305'}, {'0', 'a', '\345'}, - {'A', 'A', '\305'}, {'a', 'a', '\345'}, - {'A', 'E', '\306'}, {'a', 'e', '\346'}, - {',', 'C', '\307'}, {',', 'c', '\347'}, - {'`', 'E', '\310'}, {'`', 'e', '\350'}, - {'\'', 'E', '\311'}, {'\'', 'e', '\351'}, - {'^', 'E', '\312'}, {'^', 'e', '\352'}, - {'"', 'E', '\313'}, {'"', 'e', '\353'}, - {'`', 'I', '\314'}, {'`', 'i', '\354'}, - {'\'', 'I', '\315'}, {'\'', 'i', '\355'}, - {'^', 'I', '\316'}, {'^', 'i', '\356'}, - {'"', 'I', '\317'}, {'"', 'i', '\357'}, - {'-', 'D', '\320'}, {'-', 'd', '\360'}, - {'~', 'N', '\321'}, {'~', 'n', '\361'}, - {'`', 'O', '\322'}, {'`', 'o', '\362'}, - {'\'', 'O', '\323'}, {'\'', 'o', '\363'}, - {'^', 'O', '\324'}, {'^', 'o', '\364'}, - {'~', 'O', '\325'}, {'~', 'o', '\365'}, - {'"', 'O', '\326'}, {'"', 'o', '\366'}, - {'/', 'O', '\330'}, {'/', 'o', '\370'}, - {'`', 'U', '\331'}, {'`', 'u', '\371'}, - {'\'', 'U', '\332'}, {'\'', 'u', '\372'}, - {'^', 'U', '\333'}, {'^', 'u', '\373'}, - {'"', 'U', '\334'}, {'"', 'u', '\374'}, - {'\'', 'Y', '\335'}, {'\'', 'y', '\375'}, - {'T', 'H', '\336'}, {'t', 'h', '\376'}, - {'s', 's', '\337'}, {'"', 'y', '\377'}, - {'s', 'z', '\337'}, {'i', 'j', '\377'}, + {'`', 'A', 0300}, {'`', 'a', 0340}, + {'\'', 'A', 0301}, {'\'', 'a', 0341}, + {'^', 'A', 0302}, {'^', 'a', 0342}, + {'~', 'A', 0303}, {'~', 'a', 0343}, + {'"', 'A', 0304}, {'"', 'a', 0344}, + {'O', 'A', 0305}, {'o', 'a', 0345}, + {'0', 'A', 0305}, {'0', 'a', 0345}, + {'A', 'A', 0305}, {'a', 'a', 0345}, + {'A', 'E', 0306}, {'a', 'e', 0346}, + {',', 'C', 0307}, {',', 'c', 0347}, + {'`', 'E', 0310}, {'`', 'e', 0350}, + {'\'', 'E', 0311}, {'\'', 'e', 0351}, + {'^', 'E', 0312}, {'^', 'e', 0352}, + {'"', 'E', 0313}, {'"', 'e', 0353}, + {'`', 'I', 0314}, {'`', 'i', 0354}, + {'\'', 'I', 0315}, {'\'', 'i', 0355}, + {'^', 'I', 0316}, {'^', 'i', 0356}, + {'"', 'I', 0317}, {'"', 'i', 0357}, + {'-', 'D', 0320}, {'-', 'd', 0360}, + {'~', 'N', 0321}, {'~', 'n', 0361}, + {'`', 'O', 0322}, {'`', 'o', 0362}, + {'\'', 'O', 0323}, {'\'', 'o', 0363}, + {'^', 'O', 0324}, {'^', 'o', 0364}, + {'~', 'O', 0325}, {'~', 'o', 0365}, + {'"', 'O', 0326}, {'"', 'o', 0366}, + {'/', 'O', 0330}, {'/', 'o', 0370}, + {'`', 'U', 0331}, {'`', 'u', 0371}, + {'\'', 'U', 0332}, {'\'', 'u', 0372}, + {'^', 'U', 0333}, {'^', 'u', 0373}, + {'"', 'U', 0334}, {'"', 'u', 0374}, + {'\'', 'Y', 0335}, {'\'', 'y', 0375}, + {'T', 'H', 0336}, {'t', 'h', 0376}, + {'s', 's', 0337}, {'"', 'y', 0377}, + {'s', 'z', 0337}, {'i', 'j', 0377}, }; unsigned int accent_table_size = 68; diff --git a/drivers/char/defkeymap.c_shipped b/drivers/char/defkeymap.c_shipped index 0aa419a61767..d2208dfe3f67 100644 --- a/drivers/char/defkeymap.c_shipped +++ b/drivers/char/defkeymap.c_shipped @@ -223,40 +223,40 @@ char *func_table[MAX_NR_FUNC] = { }; struct kbdiacruc accent_table[MAX_DIACR] = { - {'`', 'A', '\300'}, {'`', 'a', '\340'}, - {'\'', 'A', '\301'}, {'\'', 'a', '\341'}, - {'^', 'A', '\302'}, {'^', 'a', '\342'}, - {'~', 'A', '\303'}, {'~', 'a', '\343'}, - {'"', 'A', '\304'}, {'"', 'a', '\344'}, - {'O', 'A', '\305'}, {'o', 'a', '\345'}, - {'0', 'A', '\305'}, {'0', 'a', '\345'}, - {'A', 'A', '\305'}, {'a', 'a', '\345'}, - {'A', 'E', '\306'}, {'a', 'e', '\346'}, - {',', 'C', '\307'}, {',', 'c', '\347'}, - {'`', 'E', '\310'}, {'`', 'e', '\350'}, - {'\'', 'E', '\311'}, {'\'', 'e', '\351'}, - {'^', 'E', '\312'}, {'^', 'e', '\352'}, - {'"', 'E', '\313'}, {'"', 'e', '\353'}, - {'`', 'I', '\314'}, {'`', 'i', '\354'}, - {'\'', 'I', '\315'}, {'\'', 'i', '\355'}, - {'^', 'I', '\316'}, {'^', 'i', '\356'}, - {'"', 'I', '\317'}, {'"', 'i', '\357'}, - {'-', 'D', '\320'}, {'-', 'd', '\360'}, - {'~', 'N', '\321'}, {'~', 'n', '\361'}, - {'`', 'O', '\322'}, {'`', 'o', '\362'}, - {'\'', 'O', '\323'}, {'\'', 'o', '\363'}, - {'^', 'O', '\324'}, {'^', 'o', '\364'}, - {'~', 'O', '\325'}, {'~', 'o', '\365'}, - {'"', 'O', '\326'}, {'"', 'o', '\366'}, - {'/', 'O', '\330'}, {'/', 'o', '\370'}, - {'`', 'U', '\331'}, {'`', 'u', '\371'}, - {'\'', 'U', '\332'}, {'\'', 'u', '\372'}, - {'^', 'U', '\333'}, {'^', 'u', '\373'}, - {'"', 'U', '\334'}, {'"', 'u', '\374'}, - {'\'', 'Y', '\335'}, {'\'', 'y', '\375'}, - {'T', 'H', '\336'}, {'t', 'h', '\376'}, - {'s', 's', '\337'}, {'"', 'y', '\377'}, - {'s', 'z', '\337'}, {'i', 'j', '\377'}, + {'`', 'A', 0300}, {'`', 'a', 0340}, + {'\'', 'A', 0301}, {'\'', 'a', 0341}, + {'^', 'A', 0302}, {'^', 'a', 0342}, + {'~', 'A', 0303}, {'~', 'a', 0343}, + {'"', 'A', 0304}, {'"', 'a', 0344}, + {'O', 'A', 0305}, {'o', 'a', 0345}, + {'0', 'A', 0305}, {'0', 'a', 0345}, + {'A', 'A', 0305}, {'a', 'a', 0345}, + {'A', 'E', 0306}, {'a', 'e', 0346}, + {',', 'C', 0307}, {',', 'c', 0347}, + {'`', 'E', 0310}, {'`', 'e', 0350}, + {'\'', 'E', 0311}, {'\'', 'e', 0351}, + {'^', 'E', 0312}, {'^', 'e', 0352}, + {'"', 'E', 0313}, {'"', 'e', 0353}, + {'`', 'I', 0314}, {'`', 'i', 0354}, + {'\'', 'I', 0315}, {'\'', 'i', 0355}, + {'^', 'I', 0316}, {'^', 'i', 0356}, + {'"', 'I', 0317}, {'"', 'i', 0357}, + {'-', 'D', 0320}, {'-', 'd', 0360}, + {'~', 'N', 0321}, {'~', 'n', 0361}, + {'`', 'O', 0322}, {'`', 'o', 0362}, + {'\'', 'O', 0323}, {'\'', 'o', 0363}, + {'^', 'O', 0324}, {'^', 'o', 0364}, + {'~', 'O', 0325}, {'~', 'o', 0365}, + {'"', 'O', 0326}, {'"', 'o', 0366}, + {'/', 'O', 0330}, {'/', 'o', 0370}, + {'`', 'U', 0331}, {'`', 'u', 0371}, + {'\'', 'U', 0332}, {'\'', 'u', 0372}, + {'^', 'U', 0333}, {'^', 'u', 0373}, + {'"', 'U', 0334}, {'"', 'u', 0374}, + {'\'', 'Y', 0335}, {'\'', 'y', 0375}, + {'T', 'H', 0336}, {'t', 'h', 0376}, + {'s', 's', 0337}, {'"', 'y', 0377}, + {'s', 'z', 0337}, {'i', 'j', 0377}, }; unsigned int accent_table_size = 68; diff --git a/drivers/s390/char/defkeymap.c b/drivers/s390/char/defkeymap.c index 389346cda6c8..9692d6a205ef 100644 --- a/drivers/s390/char/defkeymap.c +++ b/drivers/s390/char/defkeymap.c @@ -151,8 +151,8 @@ char *func_table[MAX_NR_FUNC] = { }; struct kbdiacruc accent_table[MAX_DIACR] = { - {'^', 'c', '\003'}, {'^', 'd', '\004'}, - {'^', 'z', '\032'}, {'^', '\012', '\000'}, + {'^', 'c', 0003}, {'^', 'd', 0004}, + {'^', 'z', 0032}, {'^', 0012', 0000}, }; unsigned int accent_table_size = 4; From f49ee505b1ecb5960984880740f09aba87f870dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:40 +0300 Subject: [PATCH 49/51] introduce kill_orphaned_pgrp() helper Factor out the common code in reparent_thread() and exit_notify(). No functional changes. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 74 ++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 506a957b665a..11fcce760151 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -255,6 +255,37 @@ static int has_stopped_jobs(struct pid *pgrp) return retval; } +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + /** * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * @@ -635,22 +666,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. - */ - if ((task_pgrp(p) != task_pgrp(father)) && - (task_session(p) == task_session(father))) { - struct pid *pgrp = task_pgrp(p); - - if (will_become_orphaned_pgrp(pgrp, NULL) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } - } + kill_orphaned_pgrp(p, father); } /* @@ -738,8 +754,6 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk) { int state; - struct task_struct *t; - struct pid *pgrp; /* * This does two things: @@ -753,25 +767,7 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - /* - * Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - * - * Case i: Our father is in a different pgrp than we are - * and we were the only connection outside, so our pgrp - * is about to become orphaned. - */ - t = tsk->real_parent; - - pgrp = task_pgrp(tsk); - if ((task_pgrp(t) != pgrp) && - (task_session(t) == task_session(tsk)) && - will_become_orphaned_pgrp(pgrp, tsk) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } + kill_orphaned_pgrp(tsk, NULL); /* Let father know we died * @@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk) * the same after a fork. */ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && - ( tsk->parent_exec_id != t->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; From 05e83df624fe682bb8571cdb2c6d5284a99c3066 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:42 +0300 Subject: [PATCH 50/51] will_become_orphaned_pgrp: partially fix insufficient ->exit_state check p->exit_state != 0 doesn't mean this process is dead, it may have sub-threads. Change the code to use "p->exit_state && thread_group_empty(p)" instead. Without this patch, ^Z doesn't deliver SIGTSTP to the foreground process if the main thread has exited. However, the new check is not perfect either. There is a window when exit_notify() drops tasklist and before release_task(). Suppose that the last (non-leader) thread exits. This means that entire group exits, but thread_group_empty() is not true yet. As Eric pointed out, is_global_init() is wrong as well, but I did not dare to do other changes. Just for the record, has_stopped_jobs() is absolutely wrong too. But we can't fix it now, we should first fix SIGNAL_STOP_STOPPED issues. Even with this patch ^Z doesn't play well with the dead main thread. The task is stopped correctly but do_wait(WSTOPPED) won't see it. This is another unrelated issue, will be (hopefully) fixed separately. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 11fcce760151..41c1edace97a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; - int ret = 1; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state - || is_global_init(p->real_parent)) + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) continue; + if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) { - ret = 0; - break; - } + task_session(p->real_parent) == task_session(p)) + return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ + + return 1; } int is_current_pgrp_orphaned(void) From 821c7de7194e77afee1a69d50830a329a6d9af9f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:44 +0300 Subject: [PATCH 51/51] exit_notify: fix kill_orphaned_pgrp() usage with mt exit 1. exit_notify() always calls kill_orphaned_pgrp(). This is wrong, we should do this only when the whole process exits. 2. exit_notify() uses "current" as "ignored_task", obviously wrong. Use ->group_leader instead. Test case: void hup(int sig) { printf("HUP received\n"); } void *tfunc(void *arg) { sleep(2); printf("sub-thread exited\n"); return NULL; } int main(int argc, char *argv[]) { if (!fork()) { signal(SIGHUP, hup); kill(getpid(), SIGSTOP); exit(0); } pthread_t thr; pthread_create(&thr, NULL, tfunc, NULL); sleep(1); printf("main thread exited\n"); syscall(__NR_exit, 0); return 0; } output: main thread exited HUP received Hangup With this patch the output is: main thread exited sub-thread exited HUP received Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 41c1edace97a..cd20bf07e9e3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -750,7 +750,7 @@ static void forget_original_parent(struct task_struct *father) * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(struct task_struct *tsk, int group_dead) { int state; @@ -766,7 +766,8 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - kill_orphaned_pgrp(tsk, NULL); + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * @@ -981,7 +982,7 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_notify(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL;