Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

2010-04-09 15:17:41 +01:00 · 2010-04-09 15:17:41 +01:00 · 87d8a69709
parent 0b8973a818 2eaa9cfdf3
commit 87d8a69709
9969 changed files with 671197 additions and 288345 deletions
--- a/.gitignore
+++ b/.gitignore
@ -22,6 +22,7 @@
 *.lst
 *.symtypes
 *.order
+modules.builtin
 *.elf
 *.bin
 *.gz
@ -33,26 +34,26 @@
 #
 # Top-level generic files
 #
-tags
-TAGS
-vmlinux
-System.map
-Module.markers
-Module.symvers
+/tags
+/TAGS
+/linux
+/vmlinux
+/vmlinuz
+/System.map
+/Module.markers
+/Module.symvers
+
+#
+# git files that we don't want to ignore even it they are dot-files
+#
 !.gitignore
 !.mailmap

 #
 # Generated include files
 #
-include/asm
-include/asm-*/asm-offsets.h
 include/config
-include/linux/autoconf.h
-include/linux/compile.h
 include/linux/version.h
-include/linux/utsrelease.h
-include/linux/bounds.h
 include/generated

 # stgit generated dirs
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@ -0,0 +1,7 @@
+What:		/sys/devices/system/node/nodeX
+Date:		October 2002
+Contact:	Linux Memory Management list <linux-mm@kvack.org>
+Description:
+		When CONFIG_NUMA is enabled, this is a directory containing
+		information on node X such as what CPUs are local to the
+		node.
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@ -20,7 +20,7 @@ Description:
 			lsm:	[[subj_user=] [subj_role=] [subj_type=]
 				 [obj_user=] [obj_role=] [obj_type=]]

-		base: 	func:= [BPRM_CHECK][FILE_MMAP][INODE_PERMISSION]
+		base: 	func:= [BPRM_CHECK][FILE_MMAP][FILE_CHECK]
 			mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC]
 			fsmagic:= hex value
 			uid:= decimal value
@ -40,11 +40,11 @@ Description:

 			measure func=BPRM_CHECK
 			measure func=FILE_MMAP mask=MAY_EXEC
-			measure func=INODE_PERM mask=MAY_READ uid=0
+			measure func=FILE_CHECK mask=MAY_READ uid=0

 		The default policy measures all executables in bprm_check,
 		all files mmapped executable in file_mmap, and all files
-		open for read by root in inode_permission.
+		open for read by root in do_filp_open.

 		Examples of LSM specific definitions:

@ -54,8 +54,8 @@ Description:

 			dont_measure obj_type=var_log_t
 			dont_measure obj_type=auditd_log_t
-			measure subj_user=system_u func=INODE_PERM mask=MAY_READ
-			measure subj_role=system_r func=INODE_PERM mask=MAY_READ
+			measure subj_user=system_u func=FILE_CHECK mask=MAY_READ
+			measure subj_role=system_r func=FILE_CHECK mask=MAY_READ

 		Smack:
-			measure subj_user=_ func=INODE_PERM mask=MAY_READ
+			measure subj_user=_ func=FILE_CHECK mask=MAY_READ
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@ -128,3 +128,17 @@ Description:
 		preferred request size for workloads where sustained
 		throughput is desired.  If no optimal I/O size is
 		reported this file contains 0.
+
+What:		/sys/block/<disk>/queue/nomerges
+Date:		January 2010
+Contact:
+Description:
+		Standard I/O elevator operations include attempts to
+		merge contiguous I/Os. For known random I/O loads these
+		attempts will always fail and result in extra cycles
+		being spent in the kernel. This allows one to turn off
+		this behavior on one of two ways: When set to 1, complex
+		merge checks are disabled, but the simple one-shot merges
+		with the previous I/O request are enabled. When set to 2,
+		all merge tries are disabled. The default value is 0 -
+		which enables all types of merge tries.
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@ -21,25 +21,27 @@ Contact:	Alan Stern <stern@rowland.harvard.edu>
 Description:
 		Each USB device directory will contain a file named
 		power/level.  This file holds a power-level setting for
-		the device, one of "on", "auto", or "suspend".
+		the device, either "on" or "auto".

 		"on" means that the device is not allowed to autosuspend,
 		although normal suspends for system sleep will still
 		be honored.  "auto" means the device will autosuspend
 		and autoresume in the usual manner, according to the
-		capabilities of its driver.  "suspend" means the device
-		is forced into a suspended state and it will not autoresume
-		in response to I/O requests.  However remote-wakeup requests
-		from the device may still be enabled (the remote-wakeup
-		setting is controlled separately by the power/wakeup
-		attribute).
+		capabilities of its driver.

 		During normal use, devices should be left in the "auto"
-		level.  The other levels are meant for administrative uses.
+		level.  The "on" level is meant for administrative uses.
 		If you want to suspend a device immediately but leave it
 		free to wake up in response to I/O requests, you should
 		write "0" to power/autosuspend.

+		Device not capable of proper suspend and resume should be
+		left in the "on" level.  Although the USB spec requires
+		devices to support suspend/resume, many of them do not.
+		In fact so many don't that by default, the USB core
+		initializes all non-hub devices in the "on" level.  Some
+		drivers may change this setting when they are bound.
+
 What:		/sys/bus/usb/devices/.../power/persist
 Date:		May 2007
 KernelVersion:	2.6.23
@ -157,3 +159,14 @@ Description:
 		device.  This is useful to ensure auto probing won't
 		match the driver to the device.  For example:
 		# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
+
+What:		/sys/bus/usb/device/.../avoid_reset_quirk
+Date:		December 2009
+Contact:	Oliver Neukum <oliver@neukum.org>
+Description:
+		Writing 1 to this file tells the kernel that this
+		device will morph into another mode when it is reset.
+		Drivers will not use reset for error handling for
+		such devices.
+Users:
+		usb_modeswitch
--- a/Documentation/ABI/testing/sysfs-devices-power
+++ b/Documentation/ABI/testing/sysfs-devices-power
@ -0,0 +1,79 @@
+What:		/sys/devices/.../power/
+Date:		January 2009
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/devices/.../power directory contains attributes
+		allowing the user space to check and modify some power
+		management related properties of given device.
+
+What:		/sys/devices/.../power/wakeup
+Date:		January 2009
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/devices/.../power/wakeup attribute allows the user
+		space to check if the device is enabled to wake up the system
+		from sleep states, such as the memory sleep state (suspend to
+		RAM) and hibernation (suspend to disk), and to enable or disable
+		it to do that as desired.
+
+		Some devices support "wakeup" events, which are hardware signals
+		used to activate the system from a sleep state.  Such devices
+		have one of the following two values for the sysfs power/wakeup
+		file:
+
+		+ "enabled\n" to issue the events;
+		+ "disabled\n" not to do so;
+
+		In that cases the user space can change the setting represented
+		by the contents of this file by writing either "enabled", or
+		"disabled" to it.
+
+		For the devices that are not capable of generating system wakeup
+		events this file contains "\n".  In that cases the user space
+		cannot modify the contents of this file and the device cannot be
+		enabled to wake up the system.
+
+What:		/sys/devices/.../power/control
+Date:		January 2009
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/devices/.../power/control attribute allows the user
+		space to control the run-time power management of the device.
+
+		All devices have one of the following two values for the
+		power/control file:
+
+		+ "auto\n" to allow the device to be power managed at run time;
+		+ "on\n" to prevent the device from being power managed;
+
+		The default for all devices is "auto", which means that they may
+		be subject to automatic power management, depending on their
+		drivers.  Changing this attribute to "on" prevents the driver
+		from power managing the device at run time.  Doing that while
+		the device is suspended causes it to be woken up.
+
+What:		/sys/devices/.../power/async
+Date:		January 2009
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/devices/.../async attribute allows the user space to
+		enable or diasble the device's suspend and resume callbacks to
+		be executed asynchronously (ie. in separate threads, in parallel
+		with the main suspend/resume thread) during system-wide power
+		transitions (eg. suspend to RAM, hibernation).
+
+		All devices have one of the following two values for the
+		power/async file:
+
+		+ "enabled\n" to permit the asynchronous suspend/resume;
+		+ "disabled\n" to forbid it;
+
+		The value of this attribute may be changed by writing either
+		"enabled", or "disabled" to it.
+
+		It generally is unsafe to permit the asynchronous suspend/resume
+		of a device unless it is certain that all of the PM dependencies
+		of the device are known to the PM core.  However, for some
+		devices this attribute is set to "enabled" by bus type code or
+		device drivers and in that cases it should be safe to leave the
+		default value.
--- a/Documentation/ABI/testing/sysfs-memory-page-offline
+++ b/Documentation/ABI/testing/sysfs-memory-page-offline
@ -0,0 +1,44 @@
+What:		/sys/devices/system/memory/soft_offline_page
+Date:		Sep 2009
+KernelVersion:	2.6.33
+Contact:	andi@firstfloor.org
+Description:
+		Soft-offline the memory page containing the physical address
+		written into this file. Input is a hex number specifying the
+		physical address of the page. The kernel will then attempt
+		to soft-offline it, by moving the contents elsewhere or
+		dropping it if possible. The kernel will then be placed
+		on the bad page list and never be reused.
+
+		The offlining is done in kernel specific granuality.
+		Normally it's the base page size of the kernel, but
+		this might change.
+
+		The page must be still accessible, not poisoned. The
+		kernel will never kill anything for this, but rather
+		fail the offline.  Return value is the size of the
+		number, or a error when the offlining failed.  Reading
+		the file is not allowed.
+
+What:		/sys/devices/system/memory/hard_offline_page
+Date:		Sep 2009
+KernelVersion:	2.6.33
+Contact:	andi@firstfloor.org
+Description:
+		Hard-offline the memory page containing the physical
+		address written into this file. Input is a hex number
+		specifying the physical address of the page. The
+		kernel will then attempt to hard-offline the page, by
+		trying to drop the page or killing any owner or
+		triggering IO errors if needed.  Note this may kill
+		any processes owning the page. The kernel will avoid
+		to access this page assuming it's poisoned by the
+		hardware.
+
+		The offlining is done in kernel specific granuality.
+		Normally it's the base page size of the kernel, but
+		this might change.
+
+		Return value is the size of the number, or a error when
+		the offlining failed.
+		Reading the file is not allowed.
--- a/Documentation/ABI/testing/sysfs-platform-asus-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop
@ -1,4 +1,4 @@
-What:		/sys/devices/platform/asus-laptop/display
+What:		/sys/devices/platform/asus_laptop/display
 Date:		January 2007
 KernelVersion:	2.6.20
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -13,7 +13,7 @@ Description:
 		Ex: - 0 (0000b) means no display
 		    - 3 (0011b) CRT+LCD.

-What:		/sys/devices/platform/asus-laptop/gps
+What:		/sys/devices/platform/asus_laptop/gps
 Date:		January 2007
 KernelVersion:	2.6.20
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -21,7 +21,7 @@ Description:
 		Control the gps device. 1 means on, 0 means off.
 Users:		Lapsus

-What:		/sys/devices/platform/asus-laptop/ledd
+What:		/sys/devices/platform/asus_laptop/ledd
 Date:		January 2007
 KernelVersion:	2.6.20
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -29,11 +29,11 @@ Description:
 		Some models like the W1N have a LED display that can be
 		used to display several informations.
 		To control the LED display, use the following :
-		    echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
+		    echo 0x0T000DDD > /sys/devices/platform/asus_laptop/
 		where T control the 3 letters display, and DDD the 3 digits display.
 		The DDD table can be found in Documentation/laptops/asus-laptop.txt

-What:		/sys/devices/platform/asus-laptop/bluetooth
+What:		/sys/devices/platform/asus_laptop/bluetooth
 Date:		January 2007
 KernelVersion:	2.6.20
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -42,7 +42,7 @@ Description:
 		This may control the led, the device or both.
 Users:		Lapsus

-What:		/sys/devices/platform/asus-laptop/wlan
+What:		/sys/devices/platform/asus_laptop/wlan
 Date:		January 2007
 KernelVersion:	2.6.20
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
--- a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop
@ -1,4 +1,4 @@
-What:		/sys/devices/platform/eeepc-laptop/disp
+What:		/sys/devices/platform/eeepc/disp
 Date:		May 2008
 KernelVersion:	2.6.26
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -9,21 +9,21 @@ Description:
 		- 3 = LCD+CRT
 		If you run X11, you should use xrandr instead.

-What:		/sys/devices/platform/eeepc-laptop/camera
+What:		/sys/devices/platform/eeepc/camera
 Date:		May 2008
 KernelVersion:	2.6.26
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
 Description:
 		Control the camera. 1 means on, 0 means off.

-What:		/sys/devices/platform/eeepc-laptop/cardr
+What:		/sys/devices/platform/eeepc/cardr
 Date:		May 2008
 KernelVersion:	2.6.26
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
 Description:
 		Control the card reader. 1 means on, 0 means off.

-What:		/sys/devices/platform/eeepc-laptop/cpufv
+What:		/sys/devices/platform/eeepc/cpufv
 Date:		Jun 2009
 KernelVersion:	2.6.31
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
@ -42,7 +42,7 @@ Description:
 		    `------------ Availables modes
 		For example, 0x301 means: mode 1 selected, 3 available modes.

-What:		/sys/devices/platform/eeepc-laptop/available_cpufv
+What:		/sys/devices/platform/eeepc/available_cpufv
 Date:		Jun 2009
 KernelVersion:	2.6.31
 Contact:	"Corentin Chary" <corentincj@iksaif.net>
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@ -101,3 +101,16 @@ Description:

 		CAUTION: Using it will cause your machine's real-time (CMOS)
 		clock to be set to a random invalid time after a resume.
+
+What:		/sys/power/pm_async
+Date:		January 2009
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/power/pm_async file controls the switch allowing the
+		user space to enable or disable asynchronous suspend and resume
+		of devices.  If enabled, this feature will cause some device
+		drivers' suspend and resume callbacks to be executed in parallel
+		with each other and with the main suspend thread.  It is enabled
+		if this file contains "1", which is the default.  It may be
+		disabled by writing "0" to this file, in which case all devices
+		will be suspended and resumed synchronously.
--- a/Documentation/Changes
+++ b/Documentation/Changes
@ -49,6 +49,8 @@ o  oprofile               0.9                     # oprofiled --version
 o  udev                   081                     # udevinfo -V
 o  grub                   0.93                    # grub --version
 o  mcelog		  0.6
+o  iptables               1.4.1                   # iptables -V
+

 Kernel compilation
 ==================
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@ -1,12 +1,12 @@
-			Dynamic DMA mapping
-			===================
+		     Dynamic DMA mapping Guide
+		     =========================

 		 David S. Miller <davem@redhat.com>
 		 Richard Henderson <rth@cygnus.com>
 		  Jakub Jelinek <jakub@redhat.com>

-This document describes the DMA mapping system in terms of the pci_
-API.  For a similar API that works for generic devices, see
+This is a guide to device driver writers on how to use the DMA API
+with example pseudo-code.  For a concise description of the API, see
 DMA-API.txt.

 Most of the 64bit platforms have special hardware that translates bus
@ -26,12 +26,15 @@ mapped only for the time they are actually used and unmapped after the DMA
 transfer.

 The following API will work of course even on platforms where no such
-hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on
-top of the virt_to_bus interface.
+hardware exists.
+
+Note that the DMA API works with any bus independent of the underlying
+microprocessor architecture. You should use the DMA API rather than
+the bus specific DMA API (e.g. pci_dma_*).

 First of all, you should make sure

-#include <linux/pci.h>
+#include <linux/dma-mapping.h>

 is in your driver. This file will obtain for you the definition of the
 dma_addr_t (which can hold any valid DMA address for the platform)
@ -78,44 +81,43 @@ for you to DMA from/to.
 			DMA addressing limitations

 Does your device have any DMA addressing limitations?  For example, is
-your device only capable of driving the low order 24-bits of address
-on the PCI bus for SAC DMA transfers?  If so, you need to inform the
-PCI layer of this fact.
+your device only capable of driving the low order 24-bits of address?
+If so, you need to inform the kernel of this fact.

 By default, the kernel assumes that your device can address the full
-32-bits in a SAC cycle.  For a 64-bit DAC capable device, this needs
-to be increased.  And for a device with limitations, as discussed in
-the previous paragraph, it needs to be decreased.
+32-bits.  For a 64-bit capable device, this needs to be increased.
+And for a device with limitations, as discussed in the previous
+paragraph, it needs to be decreased.

-pci_alloc_consistent() by default will return 32-bit DMA addresses.
-PCI-X specification requires PCI-X devices to support 64-bit
-addressing (DAC) for all transactions. And at least one platform (SGI
-SN2) requires 64-bit consistent allocations to operate correctly when
-the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(),
-it's good practice to call pci_set_consistent_dma_mask() to set the
-appropriate mask even if your device only supports 32-bit DMA
-(default) and especially if it's a PCI-X device.
+Special note about PCI: PCI-X specification requires PCI-X devices to
+support 64-bit addressing (DAC) for all transactions.  And at least
+one platform (SGI SN2) requires 64-bit consistent allocations to
+operate correctly when the IO bus is in PCI-X mode.

-For correct operation, you must interrogate the PCI layer in your
-device probe routine to see if the PCI controller on the machine can
-properly support the DMA addressing limitation your device has.  It is
-good style to do this even if your device holds the default setting,
+For correct operation, you must interrogate the kernel in your device
+probe routine to see if the DMA controller on the machine can properly
+support the DMA addressing limitation your device has.  It is good
+style to do this even if your device holds the default setting,
 because this shows that you did think about these issues wrt. your
 device.

-The query is performed via a call to pci_set_dma_mask():
+The query is performed via a call to dma_set_mask():

-	int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask);
+	int dma_set_mask(struct device *dev, u64 mask);

 The query for consistent allocations is performed via a call to
-pci_set_consistent_dma_mask():
+dma_set_coherent_mask():

-	int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask);
+	int dma_set_coherent_mask(struct device *dev, u64 mask);

-Here, pdev is a pointer to the PCI device struct of your device, and
-device_mask is a bit mask describing which bits of a PCI address your
-device supports.  It returns zero if your card can perform DMA
-properly on the machine given the address mask you provided.
+Here, dev is a pointer to the device struct of your device, and mask
+is a bit mask describing which bits of an address your device
+supports.  It returns zero if your card can perform DMA properly on
+the machine given the address mask you provided.  In general, the
+device struct of your device is embedded in the bus specific device
+struct of your device.  For example, a pointer to the device struct of
+your PCI device is pdev->dev (pdev is a pointer to the PCI device
+struct of your device).

 If it returns non-zero, your device cannot perform DMA properly on
 this platform, and attempting to do so will result in undefined
@ -133,31 +135,30 @@ of your driver reports that performance is bad or that the device is not
 even detected, you can ask them for the kernel messages to find out
 exactly why.

-The standard 32-bit addressing PCI device would do something like
-this:
+The standard 32-bit addressing device would do something like this:

-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+	if (dma_set_mask(dev, DMA_BIT_MASK(32))) {
 		printk(KERN_WARNING
 		       "mydev: No suitable DMA available.\n");
 		goto ignore_this_device;
 	}

-Another common scenario is a 64-bit capable device.  The approach
-here is to try for 64-bit DAC addressing, but back down to a
-32-bit mask should that fail.  The PCI platform code may fail the
-64-bit mask not because the platform is not capable of 64-bit
-addressing.  Rather, it may fail in this case simply because
-32-bit SAC addressing is done more efficiently than DAC addressing.
-Sparc64 is one platform which behaves in this way.
+Another common scenario is a 64-bit capable device.  The approach here
+is to try for 64-bit addressing, but back down to a 32-bit mask that
+should not fail.  The kernel may fail the 64-bit mask not because the
+platform is not capable of 64-bit addressing.  Rather, it may fail in
+this case simply because 32-bit addressing is done more efficiently
+than 64-bit addressing.  For example, Sparc64 PCI SAC addressing is
+more efficient than DAC addressing.

 Here is how you would handle a 64-bit capable device which can drive
 all 64-bits when accessing streaming DMA:

 	int using_dac;

-	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
 		using_dac = 1;
-	} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
 		using_dac = 0;
 	} else {
 		printk(KERN_WARNING
@ -170,36 +171,36 @@ the case would look like this:

 	int using_dac, consistent_using_dac;

-	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
 		using_dac = 1;
 	   	consistent_using_dac = 1;
-		pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+		dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
+	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
 		using_dac = 0;
 		consistent_using_dac = 0;
-		pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
 	} else {
 		printk(KERN_WARNING
 		       "mydev: No suitable DMA available.\n");
 		goto ignore_this_device;
 	}

-pci_set_consistent_dma_mask() will always be able to set the same or a
-smaller mask as pci_set_dma_mask(). However for the rare case that a
+dma_set_coherent_mask() will always be able to set the same or a
+smaller mask as dma_set_mask(). However for the rare case that a
 device driver only uses consistent allocations, one would have to
-check the return value from pci_set_consistent_dma_mask().
+check the return value from dma_set_coherent_mask().

 Finally, if your device can only drive the low 24-bits of
-address during PCI bus mastering you might do something like:
+address you might do something like:

-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) {
+	if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
 		printk(KERN_WARNING
 		       "mydev: 24-bit DMA addressing not available.\n");
 		goto ignore_this_device;
 	}

-When pci_set_dma_mask() is successful, and returns zero, the PCI layer
-saves away this mask you have provided.  The PCI layer will use this
+When dma_set_mask() is successful, and returns zero, the kernel saves
+away this mask you have provided.  The kernel will use this
 information later when you make DMA mappings.

 There is a case which we are aware of at this time, which is worth
@ -208,7 +209,7 @@ functions (for example a sound card provides playback and record
 functions) and the various different functions have _different_
 DMA addressing limitations, you may wish to probe each mask and
 only provide the functionality which the machine can handle.  It
-is important that the last call to pci_set_dma_mask() be for the
+is important that the last call to dma_set_mask() be for the
 most specific mask.

 Here is pseudo-code showing how this might be done:
@ -217,17 +218,17 @@ Here is pseudo-code showing how this might be done:
 	#define RECORD_ADDRESS_BITS	DMA_BIT_MASK(24)

 	struct my_sound_card *card;
-	struct pci_dev *pdev;
+	struct device *dev;

 	...
-	if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) {
+	if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) {
 		card->playback_enabled = 1;
 	} else {
 		card->playback_enabled = 0;
 		printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n",
 		       card->name);
 	}
-	if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) {
+	if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) {
 		card->record_enabled = 1;
 	} else {
 		card->record_enabled = 0;
@ -252,8 +253,8 @@ There are two types of DMA mappings:
  Think of "consistent" as "synchronous" or "coherent".

  The current default is to return consistent memory in the low 32
-  bits of the PCI bus space.  However, for future compatibility you
-  should set the consistent mask even if this default is fine for your
+  bits of the bus space.  However, for future compatibility you should
+  set the consistent mask even if this default is fine for your
  driver.

  Good examples of what to use consistent mappings for are:
@ -285,9 +286,9 @@ There are two types of DMA mappings:
 	     found in PCI bridges (such as by reading a register's value
 	     after writing it).

- Streaming DMA mappings which are usually mapped for one DMA transfer,
-  unmapped right after it (unless you use pci_dma_sync_* below) and for which
-  hardware can optimize for sequential accesses.
+- Streaming DMA mappings which are usually mapped for one DMA
+  transfer, unmapped right after it (unless you use dma_sync_* below)
+  and for which hardware can optimize for sequential accesses.

  This of "streaming" as "asynchronous" or "outside the coherency
  domain".
@ -302,8 +303,8 @@ There are two types of DMA mappings:
  optimizations the hardware allows.  To this end, when using
  such mappings you must be explicit about what you want to happen.

-Neither type of DMA mapping has alignment restrictions that come
-from PCI, although some devices may have such restrictions.
+Neither type of DMA mapping has alignment restrictions that come from
+the underlying bus, although some devices may have such restrictions.
 Also, systems with caches that aren't DMA-coherent will work better
 when the underlying buffers don't share cache lines with other data.

@ -315,33 +316,27 @@ you should do:

 	dma_addr_t dma_handle;

-	cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle);
+	cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);

-where pdev is a struct pci_dev *. This may be called in interrupt context.
-You should use dma_alloc_coherent (see DMA-API.txt) for buses
-where devices don't have struct pci_dev (like ISA, EISA).
-
-This argument is needed because the DMA translations may be bus
-specific (and often is private to the bus which the device is attached
-to).
+where device is a struct device *. This may be called in interrupt
+context with the GFP_ATOMIC flag.

 Size is the length of the region you want to allocate, in bytes.

 This routine will allocate RAM for that region, so it acts similarly to
 __get_free_pages (but takes size instead of a page order).  If your
 driver needs regions sized smaller than a page, you may prefer using
-the pci_pool interface, described below.
+the dma_pool interface, described below.

-The consistent DMA mapping interfaces, for non-NULL pdev, will by
-default return a DMA address which is SAC (Single Address Cycle)
-addressable.  Even if the device indicates (via PCI dma mask) that it
-may address the upper 32-bits and thus perform DAC cycles, consistent
-allocation will only return > 32-bit PCI addresses for DMA if the
-consistent dma mask has been explicitly changed via
-pci_set_consistent_dma_mask().  This is true of the pci_pool interface
-as well.
+The consistent DMA mapping interfaces, for non-NULL dev, will by
+default return a DMA address which is 32-bit addressable.  Even if the
+device indicates (via DMA mask) that it may address the upper 32-bits,
+consistent allocation will only return > 32-bit addresses for DMA if
+the consistent DMA mask has been explicitly changed via
+dma_set_coherent_mask().  This is true of the dma_pool interface as
+well.

-pci_alloc_consistent returns two values: the virtual address which you
+dma_alloc_coherent returns two values: the virtual address which you
 can use to access it from the CPU and dma_handle which you pass to the
 card.

@ -354,54 +349,54 @@ buffer you receive will not cross a 64K boundary.

 To unmap and free such a DMA region, you call:

-	pci_free_consistent(pdev, size, cpu_addr, dma_handle);
+	dma_free_coherent(dev, size, cpu_addr, dma_handle);

-where pdev, size are the same as in the above call and cpu_addr and
-dma_handle are the values pci_alloc_consistent returned to you.
+where dev, size are the same as in the above call and cpu_addr and
+dma_handle are the values dma_alloc_coherent returned to you.
 This function may not be called in interrupt context.

 If your driver needs lots of smaller memory regions, you can write
-custom code to subdivide pages returned by pci_alloc_consistent,
-or you can use the pci_pool API to do that.  A pci_pool is like
-a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages.
+custom code to subdivide pages returned by dma_alloc_coherent,
+or you can use the dma_pool API to do that.  A dma_pool is like
+a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages.
 Also, it understands common hardware constraints for alignment,
 like queue heads needing to be aligned on N byte boundaries.

-Create a pci_pool like this:
+Create a dma_pool like this:

-	struct pci_pool *pool;
+	struct dma_pool *pool;

-	pool = pci_pool_create(name, pdev, size, align, alloc);
+	pool = dma_pool_create(name, dev, size, align, alloc);

-The "name" is for diagnostics (like a kmem_cache name); pdev and size
+The "name" is for diagnostics (like a kmem_cache name); dev and size
 are as above.  The device's hardware alignment requirement for this
 type of data is "align" (which is expressed in bytes, and must be a
 power of two).  If your device has no boundary crossing restrictions,
 pass 0 for alloc; passing 4096 says memory allocated from this pool
 must not cross 4KByte boundaries (but at that time it may be better to
-go for pci_alloc_consistent directly instead).
+go for dma_alloc_coherent directly instead).

-Allocate memory from a pci pool like this:
+Allocate memory from a dma pool like this:

-	cpu_addr = pci_pool_alloc(pool, flags, &dma_handle);
+	cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);

 flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
-holding SMP locks), SLAB_ATOMIC otherwise.  Like pci_alloc_consistent,
+holding SMP locks), SLAB_ATOMIC otherwise.  Like dma_alloc_coherent,
 this returns two values, cpu_addr and dma_handle.

-Free memory that was allocated from a pci_pool like this:
+Free memory that was allocated from a dma_pool like this:

-	pci_pool_free(pool, cpu_addr, dma_handle);
+	dma_pool_free(pool, cpu_addr, dma_handle);

-where pool is what you passed to pci_pool_alloc, and cpu_addr and
-dma_handle are the values pci_pool_alloc returned. This function
+where pool is what you passed to dma_pool_alloc, and cpu_addr and
+dma_handle are the values dma_pool_alloc returned. This function
 may be called in interrupt context.

-Destroy a pci_pool by calling:
+Destroy a dma_pool by calling:

-	pci_pool_destroy(pool);
+	dma_pool_destroy(pool);

-Make sure you've called pci_pool_free for all memory allocated
+Make sure you've called dma_pool_free for all memory allocated
 from a pool before you destroy the pool. This function may not
 be called in interrupt context.

@ -411,15 +406,15 @@ The interfaces described in subsequent portions of this document
 take a DMA direction argument, which is an integer and takes on
 one of the following values:

- PCI_DMA_BIDIRECTIONAL
- PCI_DMA_TODEVICE
- PCI_DMA_FROMDEVICE
- PCI_DMA_NONE
+ DMA_BIDIRECTIONAL
+ DMA_TO_DEVICE
+ DMA_FROM_DEVICE
+ DMA_NONE

 One should provide the exact DMA direction if you know it.

-PCI_DMA_TODEVICE means "from main memory to the PCI device"
-PCI_DMA_FROMDEVICE means "from the PCI device to main memory"
+DMA_TO_DEVICE means "from main memory to the device"
+DMA_FROM_DEVICE means "from the device to main memory"
 It is the direction in which the data moves during the DMA
 transfer.

@ -427,12 +422,12 @@ You are _strongly_ encouraged to specify this as precisely
 as you possibly can.

 If you absolutely cannot know the direction of the DMA transfer,
-specify PCI_DMA_BIDIRECTIONAL.  It means that the DMA can go in
+specify DMA_BIDIRECTIONAL.  It means that the DMA can go in
 either direction.  The platform guarantees that you may legally
 specify this, and that it will work, but this may be at the
 cost of performance for example.

-The value PCI_DMA_NONE is to be used for debugging.  One can
+The value DMA_NONE is to be used for debugging.  One can
 hold this in a data structure before you come to know the
 precise direction, and this will help catch cases where your
 direction tracking logic has failed to set things up properly.
@ -442,21 +437,21 @@ potential platform-specific optimizations of such) is for debugging.
 Some platforms actually have a write permission boolean which DMA
 mappings can be marked with, much like page protections in the user
 program address space.  Such platforms can and do report errors in the
-kernel logs when the PCI controller hardware detects violation of the
+kernel logs when the DMA controller hardware detects violation of the
 permission setting.

 Only streaming mappings specify a direction, consistent mappings
 implicitly have a direction attribute setting of
-PCI_DMA_BIDIRECTIONAL.
+DMA_BIDIRECTIONAL.

 The SCSI subsystem tells you the direction to use in the
 'sc_data_direction' member of the SCSI command your driver is
 working on.

 For Networking drivers, it's a rather simple affair.  For transmit
-packets, map/unmap them with the PCI_DMA_TODEVICE direction
+packets, map/unmap them with the DMA_TO_DEVICE direction
 specifier.  For receive packets, just the opposite, map/unmap them
-with the PCI_DMA_FROMDEVICE direction specifier.
+with the DMA_FROM_DEVICE direction specifier.

 		  Using Streaming DMA mappings

@ -467,43 +462,43 @@ scatterlist.

 To map a single region, you do:

-	struct pci_dev *pdev = mydev->pdev;
+	struct device *dev = &my_dev->dev;
 	dma_addr_t dma_handle;
 	void *addr = buffer->ptr;
 	size_t size = buffer->len;

-	dma_handle = pci_map_single(pdev, addr, size, direction);
+	dma_handle = dma_map_single(dev, addr, size, direction);

 and to unmap it:

-	pci_unmap_single(pdev, dma_handle, size, direction);
+	dma_unmap_single(dev, dma_handle, size, direction);

-You should call pci_unmap_single when the DMA activity is finished, e.g.
+You should call dma_unmap_single when the DMA activity is finished, e.g.
 from the interrupt which told you that the DMA transfer is done.

 Using cpu pointers like this for single mappings has a disadvantage,
 you cannot reference HIGHMEM memory in this way.  Thus, there is a
-map/unmap interface pair akin to pci_{map,unmap}_single.  These
+map/unmap interface pair akin to dma_{map,unmap}_single.  These
 interfaces deal with page/offset pairs instead of cpu pointers.
 Specifically:

-	struct pci_dev *pdev = mydev->pdev;
+	struct device *dev = &my_dev->dev;
 	dma_addr_t dma_handle;
 	struct page *page = buffer->page;
 	unsigned long offset = buffer->offset;
 	size_t size = buffer->len;

-	dma_handle = pci_map_page(pdev, page, offset, size, direction);
+	dma_handle = dma_map_page(dev, page, offset, size, direction);

 	...

-	pci_unmap_page(pdev, dma_handle, size, direction);
+	dma_unmap_page(dev, dma_handle, size, direction);

 Here, "offset" means byte offset within the given page.

 With scatterlists, you map a region gathered from several regions by:

-	int i, count = pci_map_sg(pdev, sglist, nents, direction);
+	int i, count = dma_map_sg(dev, sglist, nents, direction);
 	struct scatterlist *sg;

 	for_each_sg(sglist, sg, count, i) {
@ -527,16 +522,16 @@ accessed sg->address and sg->length as shown above.

 To unmap a scatterlist, just call:

-	pci_unmap_sg(pdev, sglist, nents, direction);
+	dma_unmap_sg(dev, sglist, nents, direction);

 Again, make sure DMA activity has already finished.

-PLEASE NOTE:  The 'nents' argument to the pci_unmap_sg call must be
-              the _same_ one you passed into the pci_map_sg call,
+PLEASE NOTE:  The 'nents' argument to the dma_unmap_sg call must be
+              the _same_ one you passed into the dma_map_sg call,
 	      it should _NOT_ be the 'count' value _returned_ from the
-              pci_map_sg call.
+              dma_map_sg call.

-Every pci_map_{single,sg} call should have its pci_unmap_{single,sg}
+Every dma_map_{single,sg} call should have its dma_unmap_{single,sg}
 counterpart, because the bus address space is a shared resource (although
 in some ports the mapping is per each BUS so less devices contend for the
 same bus address space) and you could render the machine unusable by eating
@ -547,14 +542,14 @@ the data in between the DMA transfers, the buffer needs to be synced
 properly in order for the cpu and device to see the most uptodate and
 correct copy of the DMA buffer.

-So, firstly, just map it with pci_map_{single,sg}, and after each DMA
+So, firstly, just map it with dma_map_{single,sg}, and after each DMA
 transfer call either:

-	pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction);
+	dma_sync_single_for_cpu(dev, dma_handle, size, direction);

 or:

-	pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction);
+	dma_sync_sg_for_cpu(dev, sglist, nents, direction);

 as appropriate.

@ -562,27 +557,27 @@ Then, if you wish to let the device get at the DMA area again,
 finish accessing the data with the cpu, and then before actually
 giving the buffer to the hardware call either:

-	pci_dma_sync_single_for_device(pdev, dma_handle, size, direction);
+	dma_sync_single_for_device(dev, dma_handle, size, direction);

 or:

-	pci_dma_sync_sg_for_device(dev, sglist, nents, direction);
+	dma_sync_sg_for_device(dev, sglist, nents, direction);

 as appropriate.

 After the last DMA transfer call one of the DMA unmap routines
-pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_*
-call till pci_unmap_*, then you don't have to call the pci_dma_sync_*
+dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_*
+call till dma_unmap_*, then you don't have to call the dma_sync_*
 routines at all.

 Here is pseudo code which shows a situation in which you would need
-to use the pci_dma_sync_*() interfaces.
+to use the dma_sync_*() interfaces.

 	my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
 	{
 		dma_addr_t mapping;

-		mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE);
+		mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE);

 		cp->rx_buf = buffer;
 		cp->rx_len = len;
@ -606,25 +601,25 @@ to use the pci_dma_sync_*() interfaces.
 			 * the DMA transfer with the CPU first
 			 * so that we see updated contents.
 			 */
-			pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma,
-						    cp->rx_len,
-						    PCI_DMA_FROMDEVICE);
+			dma_sync_single_for_cpu(&cp->dev, cp->rx_dma,
+						cp->rx_len,
+						DMA_FROM_DEVICE);

 			/* Now it is safe to examine the buffer. */
 			hp = (struct my_card_header *) cp->rx_buf;
 			if (header_is_ok(hp)) {
-				pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len,
-						 PCI_DMA_FROMDEVICE);
+				dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len,
+						 DMA_FROM_DEVICE);
 				pass_to_upper_layers(cp->rx_buf);
 				make_and_setup_new_rx_buf(cp);
 			} else {
 				/* Just sync the buffer and give it back
 				 * to the card.
 				 */
-				pci_dma_sync_single_for_device(cp->pdev,
-							       cp->rx_dma,
-							       cp->rx_len,
-							       PCI_DMA_FROMDEVICE);
+				dma_sync_single_for_device(&cp->dev,
+							   cp->rx_dma,
+							   cp->rx_len,
+							   DMA_FROM_DEVICE);
 				give_rx_buf_to_card(cp);
 			}
 		}
@ -634,19 +629,19 @@ Drivers converted fully to this interface should not use virt_to_bus any
 longer, nor should they use bus_to_virt. Some drivers have to be changed a
 little bit, because there is no longer an equivalent to bus_to_virt in the
 dynamic DMA mapping scheme - you have to always store the DMA addresses
-returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single
-calls (pci_map_sg stores them in the scatterlist itself if the platform
+returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single
+calls (dma_map_sg stores them in the scatterlist itself if the platform
 supports dynamic DMA mapping in hardware) in your driver structures and/or
 in the card registers.

-All PCI drivers should be using these interfaces with no exceptions.
-It is planned to completely remove virt_to_bus() and bus_to_virt() as
+All drivers should be using these interfaces with no exceptions.  It
+is planned to completely remove virt_to_bus() and bus_to_virt() as
 they are entirely deprecated.  Some ports already do not provide these
 as it is impossible to correctly support them.

 		Optimizing Unmap State Space Consumption

-On many platforms, pci_unmap_{single,page}() is simply a nop.
+On many platforms, dma_unmap_{single,page}() is simply a nop.
 Therefore, keeping track of the mapping address and length is a waste
 of space.  Instead of filling your drivers up with ifdefs and the like
 to "work around" this (which would defeat the whole purpose of a
@ -655,7 +650,7 @@ portable API) the following facilities are provided.
 Actually, instead of describing the macros one by one, we'll
 transform some example code.

-1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures.
+1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
   Example, before:

 	struct ring_state {
@ -668,14 +663,11 @@ transform some example code.

 	struct ring_state {
 		struct sk_buff *skb;
-		DECLARE_PCI_UNMAP_ADDR(mapping)
-		DECLARE_PCI_UNMAP_LEN(len)
+		DEFINE_DMA_UNMAP_ADDR(mapping);
+		DEFINE_DMA_UNMAP_LEN(len);
 	};

-   NOTE: DO NOT put a semicolon at the end of the DECLARE_*()
-         macro.
-
-2) Use pci_unmap_{addr,len}_set to set these values.
+2) Use dma_unmap_{addr,len}_set to set these values.
   Example, before:

 	ringp->mapping = FOO;
@ -683,21 +675,21 @@ transform some example code.

   after:

-	pci_unmap_addr_set(ringp, mapping, FOO);
-	pci_unmap_len_set(ringp, len, BAR);
+	dma_unmap_addr_set(ringp, mapping, FOO);
+	dma_unmap_len_set(ringp, len, BAR);

-3) Use pci_unmap_{addr,len} to access these values.
+3) Use dma_unmap_{addr,len} to access these values.
   Example, before:

-	pci_unmap_single(pdev, ringp->mapping, ringp->len,
-			 PCI_DMA_FROMDEVICE);
+	dma_unmap_single(dev, ringp->mapping, ringp->len,
+			 DMA_FROM_DEVICE);

   after:

-	pci_unmap_single(pdev,
-			 pci_unmap_addr(ringp, mapping),
-			 pci_unmap_len(ringp, len),
-			 PCI_DMA_FROMDEVICE);
+	dma_unmap_single(dev,
+			 dma_unmap_addr(ringp, mapping),
+			 dma_unmap_len(ringp, len),
+			 DMA_FROM_DEVICE);

 It really should be self-explanatory.  We treat the ADDR and LEN
 separately, because it is possible for an implementation to only
@ -732,15 +724,15 @@ to "Closing".
 DMA address space is limited on some architectures and an allocation
 failure can be determined by:

- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0
+- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0

- checking the returned dma_addr_t of pci_map_single and pci_map_page
-  by using pci_dma_mapping_error():
+- checking the returned dma_addr_t of dma_map_single and dma_map_page
+  by using dma_mapping_error():

 	dma_addr_t dma_handle;

-	dma_handle = pci_map_single(pdev, addr, size, direction);
-	if (pci_dma_mapping_error(pdev, dma_handle)) {
+	dma_handle = dma_map_single(dev, addr, size, direction);
+	if (dma_mapping_error(dev, dma_handle)) {
 		/*
 		 * reduce current DMA mapping usage,
 		 * delay and try again later or
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@ -4,20 +4,18 @@
        James E.J. Bottomley <James.Bottomley@HansenPartnership.com>

 This document describes the DMA API.  For a more gentle introduction
-phrased in terms of the pci_ equivalents (and actual examples) see
-Documentation/PCI/PCI-DMA-mapping.txt.
+of the API (and actual examples) see
+Documentation/DMA-API-HOWTO.txt.

-This API is split into two pieces.  Part I describes the API and the
-corresponding pci_ API.  Part II describes the extensions to the API
-for supporting non-consistent memory machines.  Unless you know that
-your driver absolutely has to support non-consistent platforms (this
-is usually only legacy platforms) you should only use the API
-described in part I.
+This API is split into two pieces.  Part I describes the API.  Part II
+describes the extensions to the API for supporting non-consistent
+memory machines.  Unless you know that your driver absolutely has to
+support non-consistent platforms (this is usually only legacy
+platforms) you should only use the API described in part I.

-Part I - pci_ and dma_ Equivalent API 
+Part I - dma_ API
 -------------------------------------

-To get the pci_ API, you must #include <linux/pci.h>
 To get the dma_ API, you must #include <linux/dma-mapping.h>


@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers
 void *
 dma_alloc_coherent(struct device *dev, size_t size,
 			     dma_addr_t *dma_handle, gfp_t flag)
-void *
-pci_alloc_consistent(struct pci_dev *dev, size_t size,
-			     dma_addr_t *dma_handle)

 Consistent memory is memory for which a write by either the device or
 the processor can immediately be read by the processor or device
@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below).
 The flag parameter (dma_alloc_coherent only) allows the caller to
 specify the GFP_ flags (see kmalloc) for the allocation (the
 implementation may choose to ignore flags that affect the location of
-the returned memory, like GFP_DMA).  For pci_alloc_consistent, you
-must assume GFP_ATOMIC behaviour.
+the returned memory, like GFP_DMA).

 void
 dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
 			   dma_addr_t dma_handle)
-void
-pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
-			   dma_addr_t dma_handle)

 Free the region of consistent memory you previously allocated.  dev,
 size and dma_handle must all be the same as those passed into the
@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries.
 	dma_pool_create(const char *name, struct device *dev,
 			size_t size, size_t align, size_t alloc);

-	struct pci_pool *
-	pci_pool_create(const char *name, struct pci_device *dev,
-			size_t size, size_t align, size_t alloc);
-
 The pool create() routines initialize a pool of dma-coherent buffers
 for use with a given device.  It must be called in a context which
 can sleep.
@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries.
 	void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
 			dma_addr_t *dma_handle);

-	void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
-			dma_addr_t *dma_handle);
-
 This allocates memory from the pool; the returned memory will meet the size
 and alignment requirements specified at creation time.  Pass GFP_ATOMIC to
 prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
@ -122,9 +106,6 @@ pool's device.
 	void dma_pool_free(struct dma_pool *pool, void *vaddr,
 			dma_addr_t addr);

-	void pci_pool_free(struct pci_pool *pool, void *vaddr,
-			dma_addr_t addr);
-
 This puts memory back into the pool.  The pool is what was passed to
 the pool allocation routine; the cpu (vaddr) and dma addresses are what
 were returned when that routine allocated the memory being freed.
@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed.

 	void dma_pool_destroy(struct dma_pool *pool);

-	void pci_pool_destroy(struct pci_pool *pool);
-
 The pool destroy() routines free the resources of the pool.  They must be
 called in a context which can sleep.  Make sure you've freed all allocated
 memory back to the pool before you destroy it.
@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations

 int
 dma_supported(struct device *dev, u64 mask)
-int
-pci_dma_supported(struct pci_dev *hwdev, u64 mask)

 Checks to see if the device can support DMA to the memory described by
 mask.
@ -159,8 +136,14 @@ driver writers.

 int
 dma_set_mask(struct device *dev, u64 mask)
+
+Checks to see if the mask is possible and updates the device
+parameters if it is.
+
+Returns: 0 if successful and a negative error if not.
+
 int
-pci_set_dma_mask(struct pci_device *dev, u64 mask)
+dma_set_coherent_mask(struct device *dev, u64 mask)

 Checks to see if the mask is possible and updates the device
 parameters if it is.
@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings
 dma_addr_t
 dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 		      enum dma_data_direction direction)
-dma_addr_t
-pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
-		      int direction)

 Maps a piece of processor virtual memory so it can be accessed by the
 device and returns the physical handle of the memory.
@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting.
 However the dma_ API uses a strongly typed enumerator for its
 direction:

-DMA_NONE		= PCI_DMA_NONE		no direction (used for
-						debugging)
-DMA_TO_DEVICE		= PCI_DMA_TODEVICE	data is going from the
-						memory to the device
-DMA_FROM_DEVICE		= PCI_DMA_FROMDEVICE	data is coming from
-						the device to the
-						memory
-DMA_BIDIRECTIONAL	= PCI_DMA_BIDIRECTIONAL	direction isn't known
+DMA_NONE		no direction (used for debugging)
+DMA_TO_DEVICE		data is going from the memory to the device
+DMA_FROM_DEVICE		data is coming from the device to the memory
+DMA_BIDIRECTIONAL	direction isn't known

 Notes:  Not all memory regions in a machine can be mapped by this
 API.  Further, regions that appear to be physically contiguous in
@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed).
 void
 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 		 enum dma_data_direction direction)
-void
-pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
-		 size_t size, int direction)

 Unmaps the region previously mapped.  All the parameters passed in
 must be identical to those passed in (and returned) by the mapping
@ -280,15 +253,9 @@ dma_addr_t
 dma_map_page(struct device *dev, struct page *page,
 		    unsigned long offset, size_t size,
 		    enum dma_data_direction direction)
-dma_addr_t
-pci_map_page(struct pci_dev *hwdev, struct page *page,
-		    unsigned long offset, size_t size, int direction)
 void
 dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
 	       enum dma_data_direction direction)
-void
-pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
-	       size_t size, int direction)

 API for mapping and unmapping for pages.  All the notes and warnings
 for the other mapping APIs apply here.  Also, although the <offset>
@ -299,9 +266,6 @@ cache width is.
 int
 dma_mapping_error(struct device *dev, dma_addr_t dma_addr)

-int
-pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
-
 In some circumstances dma_map_single and dma_map_page will fail to create
 a mapping. A driver can check for these errors by testing the returned
 dma address with dma_mapping_error(). A non-zero return value means the mapping
@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later).
 	int
 	dma_map_sg(struct device *dev, struct scatterlist *sg,
 		int nents, enum dma_data_direction direction)
-	int
-	pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		int nents, int direction)

 Returns: the number of physical segments mapped (this may be shorter
 than <nents> passed in if some elements of the scatter/gather list are
@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above.
 	void
 	dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 		int nhwentries, enum dma_data_direction direction)
-	void
-	pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		int nents, int direction)

 Unmap the previously mapped scatter/gather list.  All the parameters
 must be the same as those and passed in to the scatter/gather mapping
@ -365,21 +323,23 @@ Note: <nents> must be the number you passed in, *not* the number of
 physical entries returned.

 void
-dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
-		enum dma_data_direction direction)
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction direction)
 void
-pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
-			   size_t size, int direction)
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+			   enum dma_data_direction direction)
 void
-dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
-			  enum dma_data_direction direction)
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction)
 void
-pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		       int nelems, int direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+		       enum dma_data_direction direction)

-Synchronise a single contiguous or scatter/gather mapping.  All the
-parameters must be the same as those passed into the single mapping
-API.
+Synchronise a single contiguous or scatter/gather mapping for the cpu
+and device. With the sync_sg API, all the parameters must be the same
+as those passed into the single mapping API. With the sync_single API,
+you can use dma_handle and size parameters that aren't identical to
+those passed into the single mapping API to do a partial sync.

 Notes:  You must do this:

@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
 Part II - Advanced dma_ usage
 -----------------------------

-Warning: These pieces of the DMA API have no PCI equivalent.  They
-should also not be used in the majority of cases, since they cater for
-unlikely corner cases that don't belong in usual drivers.
+Warning: These pieces of the DMA API should not be used in the
+majority of cases, since they cater for unlikely corner cases that
+don't belong in usual drivers.

 If you don't understand how cache line coherency works between a
 processor and an I/O device, you should not be using this part of the
@ -513,16 +473,6 @@ line, but it will guarantee that one or more cache lines fit exactly
 into the width returned by this call.  It will also always be a power
 of two for easy alignment.

-void
-dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
-		      unsigned long offset, size_t size,
-		      enum dma_data_direction direction)
-
-Does a partial sync, starting at offset and continuing for size.  You
-must be careful to observe the cache alignment and width when doing
-anything like this.  You must also be extra careful about accessing
-memory you intend to sync partially.
-
 void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)
--- a/Documentation/DocBook/device-drivers.tmpl
+++ b/Documentation/DocBook/device-drivers.tmpl
@ -45,7 +45,7 @@
     </sect1>

     <sect1><title>Atomic and pointer manipulation</title>
-!Iarch/x86/include/asm/atomic_32.h
+!Iarch/x86/include/asm/atomic.h
 !Iarch/x86/include/asm/unaligned.h
     </sect1>

--- a/Documentation/DocBook/deviceiobook.tmpl
+++ b/Documentation/DocBook/deviceiobook.tmpl
@ -316,7 +316,7 @@ CPU B:  spin_unlock_irqrestore(&amp;dev_lock, flags)

  <chapter id="pubfunctions">
     <title>Public Functions Provided</title>
-!Iarch/x86/include/asm/io_32.h
+!Iarch/x86/include/asm/io.h
 !Elib/iomap.c
  </chapter>

--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@ -144,7 +144,7 @@ usage should require reading the full document.
        this though and the recommendation to allow only a single
        interface in STA mode at first!
      </para>
-!Finclude/net/mac80211.h ieee80211_if_init_conf
+!Finclude/net/mac80211.h ieee80211_vif
    </chapter>

    <chapter id="rx-tx">
@ -234,7 +234,6 @@ usage should require reading the full document.
      <title>Multiple queues and QoS support</title>
      <para>TBD</para>
 !Finclude/net/mac80211.h ieee80211_tx_queue_params
-!Finclude/net/mac80211.h ieee80211_tx_queue_stats
    </chapter>

    <chapter id="AP">
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@ -174,7 +174,7 @@
 		</para>
 		<programlisting>
 static struct mtd_info *board_mtd;
-static unsigned long baseaddr;
+static void __iomem *baseaddr;
 		</programlisting>
 		<para>
 			Static example
@ -182,7 +182,7 @@ static unsigned long baseaddr;
 		<programlisting>
 static struct mtd_info board_mtd;
 static struct nand_chip board_chip;
-static unsigned long baseaddr;
+static void __iomem *baseaddr;
 		</programlisting>
 	</sect1>
 	<sect1 id="Partition_defines">
@ -283,8 +283,8 @@ int __init board_init (void)
 	}

 	/* map physical address */
-	baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
-	if(!baseaddr){
+	baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
+	if (!baseaddr) {
 		printk("Ioremap to access NAND chip failed\n");
 		err = -EIO;
 		goto out_mtd;
@ -316,7 +316,7 @@ int __init board_init (void)
 	goto out;

 out_ior:
-	iounmap((void *)baseaddr);
+	iounmap(baseaddr);
 out_mtd:
 	kfree (board_mtd);
 out:
@ -341,7 +341,7 @@ static void __exit board_cleanup (void)
 	nand_release (board_mtd);

 	/* unmap physical address */
-	iounmap((void *)baseaddr);
+	iounmap(baseaddr);
 	
 	/* Free the MTD device structure */
 	kfree (board_mtd);
@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
 				The ECC bytes must be placed immidiately after the data
 				bytes in order to make the syndrome generator work. This
 				is contrary to the usual layout used by software ECC. The
-				seperation of data and out of band area is not longer
+				separation of data and out of band area is not longer
 				possible. The nand driver code handles this layout and
 				the remaining free bytes in the oob area are managed by 
 				the autoplacement code. Provide a matching oob-layout
@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
 				bad blocks. They have factory marked good blocks. The marker pattern
 				is erased when the block is erased to be reused. So in case of
 				powerloss before writing the pattern back to the chip this block 
-				would be lost and added to the bad blocks. Therefor we scan the 
+				would be lost and added to the bad blocks. Therefore we scan the 
 				chip(s) when we detect them the first time for good blocks and 
 				store this information in a bad block table before erasing any 
 				of the blocks.
@ -1094,7 +1094,7 @@ in this page</entry>
 		manufacturers specifications. This applies similar to the spare area. 
 	</para>
 	<para>
-		Therefor NAND aware filesystems must either write in page size chunks
+		Therefore NAND aware filesystems must either write in page size chunks
 		or hold a writebuffer to collect smaller writes until they sum up to 
 		pagesize. Available NAND aware filesystems: JFFS2, YAFFS. 		
 	</para>
--- a/Documentation/DocBook/v4l/common.xml
+++ b/Documentation/DocBook/v4l/common.xml
@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be
 captured or output, applications can request frame skipping or
 duplicating on the driver side. This is especially useful when using
 the &func-read; or &func-write;, which are not augmented by timestamps
-or sequence counters, and to avoid unneccessary data copying.</para>
+or sequence counters, and to avoid unnecessary data copying.</para>

    <para>Finally these ioctls can be used to determine the number of
 buffers used internally by a driver in read/write mode. For
--- a/Documentation/DocBook/v4l/io.xml
+++ b/Documentation/DocBook/v4l/io.xml
@ -589,7 +589,8 @@ number of a video input as in &v4l2-input; field
 	    <entry></entry>
 	    <entry>A place holder for future extensions and custom
 (driver defined) buffer types
-<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher.</entry>
+<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher. Applications
+should set this to 0.</entry>
 	  </row>
 	</tbody>
      </tgroup>
--- a/Documentation/DocBook/v4l/vidioc-g-parm.xml
+++ b/Documentation/DocBook/v4l/vidioc-g-parm.xml
@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or
 duplicating on the driver side. This is especially useful when using
 the <function>read()</function> or <function>write()</function>, which
 are not augmented by timestamps or sequence counters, and to avoid
-unneccessary data copying.</para>
+unnecessary data copying.</para>

    <para>Further these ioctls can be used to determine the number of
 buffers used internally by a driver in read/write mode. For
--- a/Documentation/DocBook/v4l/vidioc-qbuf.xml
+++ b/Documentation/DocBook/v4l/vidioc-qbuf.xml
@ -54,12 +54,10 @@ to enqueue an empty (capturing) or filled (output) buffer in the
 driver's incoming queue. The semantics depend on the selected I/O
 method.</para>

-    <para>To enqueue a <link linkend="mmap">memory mapped</link>
-buffer applications set the <structfield>type</structfield> field of a
-&v4l2-buffer; to the same buffer type as previously &v4l2-format;
-<structfield>type</structfield> and &v4l2-requestbuffers;
-<structfield>type</structfield>, the <structfield>memory</structfield>
-field to <constant>V4L2_MEMORY_MMAP</constant> and the
+    <para>To enqueue a buffer applications set the <structfield>type</structfield>
+field of a &v4l2-buffer; to the same buffer type as was previously used
+with &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
+<structfield>type</structfield>. Applications must also set the
 <structfield>index</structfield> field. Valid index numbers range from
 zero to the number of buffers allocated with &VIDIOC-REQBUFS;
 (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The
@ -70,8 +68,19 @@ intended for output (<structfield>type</structfield> is
 <constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also
 initialize the <structfield>bytesused</structfield>,
 <structfield>field</structfield> and
-<structfield>timestamp</structfield> fields. See <xref
-	linkend="buffer" /> for details. When
+<structfield>timestamp</structfield> fields, see <xref
+linkend="buffer" /> for details.
+Applications must also set <structfield>flags</structfield> to 0. If a driver
+supports capturing from specific video inputs and you want to specify a video
+input, then <structfield>flags</structfield> should be set to
+<constant>V4L2_BUF_FLAG_INPUT</constant> and the field
+<structfield>input</structfield> must be initialized to the desired input.
+The <structfield>reserved</structfield> field must be set to 0.
+</para>
+
+    <para>To enqueue a <link linkend="mmap">memory mapped</link>
+buffer applications set the <structfield>memory</structfield>
+field to <constant>V4L2_MEMORY_MMAP</constant>. When
 <constant>VIDIOC_QBUF</constant> is called with a pointer to this
 structure the driver sets the
 <constant>V4L2_BUF_FLAG_MAPPED</constant> and
@ -81,14 +90,10 @@ structure the driver sets the
 &EINVAL;.</para>

    <para>To enqueue a <link linkend="userp">user pointer</link>
-buffer applications set the <structfield>type</structfield> field of a
-&v4l2-buffer; to the same buffer type as previously &v4l2-format;
-<structfield>type</structfield> and &v4l2-requestbuffers;
-<structfield>type</structfield>, the <structfield>memory</structfield>
-field to <constant>V4L2_MEMORY_USERPTR</constant> and the
+buffer applications set the <structfield>memory</structfield>
+field to <constant>V4L2_MEMORY_USERPTR</constant>, the
 <structfield>m.userptr</structfield> field to the address of the
-buffer and <structfield>length</structfield> to its size. When the
-buffer is intended for output additional fields must be set as above.
+buffer and <structfield>length</structfield> to its size.
 When <constant>VIDIOC_QBUF</constant> is called with a pointer to this
 structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant>
 flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
@ -96,13 +101,14 @@ flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
 <structfield>flags</structfield> field, or it returns an error code.
 This ioctl locks the memory pages of the buffer in physical memory,
 they cannot be swapped out to disk. Buffers remain locked until
-dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl are
+dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl is
 called, or until the device is closed.</para>

    <para>Applications call the <constant>VIDIOC_DQBUF</constant>
 ioctl to dequeue a filled (capturing) or displayed (output) buffer
 from the driver's outgoing queue. They just set the
-<structfield>type</structfield> and <structfield>memory</structfield>
+<structfield>type</structfield>, <structfield>memory</structfield>
+and <structfield>reserved</structfield>
 fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant>
 is called with a pointer to this structure the driver fills the
 remaining fields or returns an error code.</para>
--- a/Documentation/DocBook/v4l/vidioc-querybuf.xml
+++ b/Documentation/DocBook/v4l/vidioc-querybuf.xml
@ -54,12 +54,13 @@ buffer at any time after buffers have been allocated with the
 &VIDIOC-REQBUFS; ioctl.</para>

    <para>Applications set the <structfield>type</structfield> field
-    of a &v4l2-buffer; to the same buffer type as previously
+    of a &v4l2-buffer; to the same buffer type as was previously used with
 &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
 <structfield>type</structfield>, and the <structfield>index</structfield>
    field. Valid index numbers range from zero
 to the number of buffers allocated with &VIDIOC-REQBUFS;
    (&v4l2-requestbuffers; <structfield>count</structfield>) minus one.
+The <structfield>reserved</structfield> field should to set to 0.
 After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to
    this structure drivers return an error code or fill the rest of
 the structure.</para>
@ -68,8 +69,8 @@ the structure.</para>
 <constant>V4L2_BUF_FLAG_MAPPED</constant>,
 <constant>V4L2_BUF_FLAG_QUEUED</constant> and
 <constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The
-<structfield>memory</structfield> field will be set to
-<constant>V4L2_MEMORY_MMAP</constant>, the <structfield>m.offset</structfield>
+<structfield>memory</structfield> field will be set to the current
+I/O method, the <structfield>m.offset</structfield>
 contains the offset of the buffer from the start of the device memory,
 the <structfield>length</structfield> field its size. The driver may
 or may not set the remaining fields and flags, they are meaningless in
--- a/Documentation/DocBook/v4l/vidioc-reqbufs.xml
+++ b/Documentation/DocBook/v4l/vidioc-reqbufs.xml
@ -54,23 +54,23 @@ I/O. Memory mapped buffers are located in device memory and must be
 allocated with this ioctl before they can be mapped into the
 application's address space. User buffers are allocated by
 applications themselves, and this ioctl is merely used to switch the
-driver into user pointer I/O mode.</para>
+driver into user pointer I/O mode and to setup some internal structures.</para>

-    <para>To allocate device buffers applications initialize three
-fields of a <structname>v4l2_requestbuffers</structname> structure.
+    <para>To allocate device buffers applications initialize all
+fields of the <structname>v4l2_requestbuffers</structname> structure.
 They set the <structfield>type</structfield> field to the respective
 stream or buffer type, the <structfield>count</structfield> field to
-the desired number of buffers, and <structfield>memory</structfield>
-must be set to <constant>V4L2_MEMORY_MMAP</constant>. When the ioctl
-is called with a pointer to this structure the driver attempts to
-allocate the requested number of buffers and stores the actual number
+the desired number of buffers, <structfield>memory</structfield>
+must be set to the requested I/O method and the reserved array
+must be zeroed. When the ioctl
+is called with a pointer to this structure the driver will attempt to allocate
+the requested number of buffers and it stores the actual number
 allocated in the <structfield>count</structfield> field. It can be
 smaller than the number requested, even zero, when the driver runs out
-of free memory. A larger number is possible when the driver requires
-more buffers to function correctly.<footnote>
-	<para>For example video output requires at least two buffers,
+of free memory. A larger number is also possible when the driver requires
+more buffers to function correctly. For example video output requires at least two buffers,
 one displayed and one filled by the application.</para>
-	</footnote> When memory mapping I/O is not supported the ioctl
+    <para>When the I/O method is not supported the ioctl
 returns an &EINVAL;.</para>

    <para>Applications can call <constant>VIDIOC_REQBUFS</constant>
@ -81,14 +81,6 @@ in progress, an implicit &VIDIOC-STREAMOFF;. <!-- mhs: I see no
 reason why munmap()ping one or even all buffers must imply
 streamoff.--></para>

-    <para>To negotiate user pointer I/O, applications initialize only
-the <structfield>type</structfield> field and set
-<structfield>memory</structfield> to
-<constant>V4L2_MEMORY_USERPTR</constant>. When the ioctl is called
-with a pointer to this structure the driver prepares for user pointer
-I/O, when this I/O method is not supported the ioctl returns an
-&EINVAL;.</para>
-
    <table pgwide="1" frame="none" id="v4l2-requestbuffers">
      <title>struct <structname>v4l2_requestbuffers</structname></title>
      <tgroup cols="3">
@ -97,9 +89,7 @@ I/O, when this I/O method is not supported the ioctl returns an
 	  <row>
 	    <entry>__u32</entry>
 	    <entry><structfield>count</structfield></entry>
-	    <entry>The number of buffers requested or granted. This
-field is only used when <structfield>memory</structfield> is set to
-<constant>V4L2_MEMORY_MMAP</constant>.</entry>
+	    <entry>The number of buffers requested or granted.</entry>
 	  </row>
 	  <row>
 	    <entry>&v4l2-buf-type;</entry>
@ -120,7 +110,7 @@ as the &v4l2-format; <structfield>type</structfield> field. See <xref
 	    <entry><structfield>reserved</structfield>[2]</entry>
 	    <entry>A place holder for future extensions and custom
 (driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and
-higher.</entry>
+higher. This array should be zeroed by applications.</entry>
 	  </row>
 	</tbody>
      </tgroup>
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@ -221,8 +221,8 @@ branches.  These different branches are:
  - main 2.6.x kernel tree
  - 2.6.x.y -stable kernel tree
  - 2.6.x -git kernel patches
-  - 2.6.x -mm kernel patches
  - subsystem specific kernel trees and patches
+  - the 2.6.x -next kernel tree for integration tests

 2.6.x kernel tree
 -----------------
@ -232,7 +232,7 @@ process is as follows:
  - As soon as a new kernel is released a two weeks window is open,
    during this period of time maintainers can submit big diffs to
    Linus, usually the patches that have already been included in the
-    -mm kernel for a few weeks.  The preferred way to submit big changes
+    -next kernel for a few weeks.  The preferred way to submit big changes
    is using git (the kernel's source management tool, more information
    can be found at http://git.or.cz/) but plain patches are also just
    fine.
@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree.  They are more
 experimental than -rc kernels since they are generated automatically
 without even a cursory glance to see if they are sane.

-2.6.x -mm kernel patches
------------------------
-These are experimental kernel patches released by Andrew Morton.  Andrew
-takes all of the different subsystem kernel trees and patches and mushes
-them together, along with a lot of patches that have been plucked from
-the linux-kernel mailing list.  This tree serves as a proving ground for
-new features and patches.  Once a patch has proved its worth in -mm for
-a while Andrew or the subsystem maintainer pushes it on to Linus for
-inclusion in mainline.
-
-It is heavily encouraged that all new patches get tested in the -mm tree
-before they are sent to Linus for inclusion in the main kernel tree.  Code
-which does not make an appearance in -mm before the opening of the merge
-window will prove hard to merge into the mainline.
-
-These kernels are not appropriate for use on systems that are supposed
-to be stable and they are more risky to run than any of the other
-branches.
-
-If you wish to help out with the kernel development process, please test
-and use these kernel releases and provide feedback to the linux-kernel
-mailing list if you have any problems, and if everything works properly.
-
-In addition to all the other experimental patches, these kernels usually
-also contain any changes in the mainline -git kernels available at the
-time of release.
-
-The -mm kernels are not released on a fixed schedule, but usually a few
-mm kernels are released in between each -rc kernel (1 to 3 is common).
-
 Subsystem Specific kernel trees and patches
 -------------------------------------------
-A number of the different kernel subsystem developers expose their
-development trees so that others can see what is happening in the
-different areas of the kernel.  These trees are pulled into the -mm
-kernel releases as described above.
+The maintainers of the various kernel subsystems --- and also many
+kernel subsystem developers --- expose their current state of
+development in source repositories.  That way, others can see what is
+happening in the different areas of the kernel.  In areas where
+development is rapid, a developer may be asked to base his submissions
+onto such a subsystem kernel tree so that conflicts between the
+submission and other already ongoing work are avoided.

-Here is a list of some of the different kernel trees available:
-  git trees:
-    - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org>
-	git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+Most of these repositories are git trees, but there are also other SCMs
+in use, or patch queues being published as quilt series.  Addresses of
+these subsystem repositories are listed in the MAINTAINERS file.  Many
+of them can be browsed at http://git.kernel.org/.

-    - ACPI development tree, Len Brown <len.brown@intel.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+Before a proposed patch is committed to such a subsystem tree, it is
+subject to review which primarily happens on mailing lists (see the
+respective section below).  For several kernel subsystems, this review
+process is tracked with the tool patchwork.  Patchwork offers a web
+interface which shows patch postings, any comments on a patch or
+revisions to it, and maintainers can mark patches as under review,
+accepted, or rejected.  Most of these patchwork sites are listed at
+http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/.

-    - Block development tree, Jens Axboe <jens.axboe@oracle.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+2.6.x -next kernel tree for integration tests
+---------------------------------------------
+Before updates from subsystem trees are merged into the mainline 2.6.x
+tree, they need to be integration-tested.  For this purpose, a special
+testing repository exists into which virtually all subsystem trees are
+pulled on an almost daily basis:
+	http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git
+	http://linux.f-seidel.de/linux-next/pmwiki/

-    - DRM development tree, Dave Airlie <airlied@linux.ie>
-	git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+This way, the -next kernel gives a summary outlook onto what will be
+expected to go into the mainline kernel at the next merge period.
+Adventurous testers are very welcome to runtime-test the -next kernel.

-    - ia64 development tree, Tony Luck <tony.luck@intel.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
-
-    - infiniband, Roland Dreier <rolandd@cisco.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
-
-    - libata, Jeff Garzik <jgarzik@pobox.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
-
-    - network drivers, Jeff Garzik <jgarzik@pobox.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
-
-    - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
-	git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
-
-    - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
-
-    - x86, Ingo Molnar <mingo@elte.hu>
-	git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
-
-  quilt trees:
-    - USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
-	kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
-
-  Other kernel trees can be found listed at http://git.kernel.org/ and in
-  the MAINTAINERS file.

 Bug Reporting
 -------------
--- a/Documentation/IO-mapping.txt
+++ b/Documentation/IO-mapping.txt
@ -157,7 +157,7 @@ For such memory, you can do things like
 	 * access only the 640k-1MB area, so anything else
 	 * has to be remapped.
 	 */
-	char * baseptr = ioremap(0xFC000000, 1024*1024);
+	void __iomem *baseptr = ioremap(0xFC000000, 1024*1024);

 	/* write a 'A' to the offset 10 of the area */
 	writeb('A',baseptr+10);
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@ -365,6 +365,7 @@ You can change this at module load time (for a module) with:
       regshifts=<shift1>,<shift2>,...
       slave_addrs=<addr1>,<addr2>,...
       force_kipmid=<enable1>,<enable2>,...
+       kipmid_max_busy_us=<ustime1>,<ustime2>,...
       unload_when_empty=[0|1]

 Each of these except si_trydefaults is a list, the first item for the
@ -433,6 +434,7 @@ kernel command line as:
       ipmi_si.regshifts=<shift1>,<shift2>,...
       ipmi_si.slave_addrs=<addr1>,<addr2>,...
       ipmi_si.force_kipmid=<enable1>,<enable2>,...
+       ipmi_si.kipmid_max_busy_us=<ustime1>,<ustime2>,...

 It works the same as the module parameters of the same names.

@ -450,6 +452,16 @@ force this thread on or off.  If you force it off and don't have
 interrupts, the driver will run VERY slowly.  Don't blame me,
 these interfaces suck.

+Unfortunately, this thread can use a lot of CPU depending on the
+interface's performance.  This can waste a lot of CPU and cause
+various issues with detecting idle CPU and using extra power.  To
+avoid this, the kipmid_max_busy_us sets the maximum amount of time, in
+microseconds, that kipmid will spin before sleeping for a tick.  This
+value sets a balance between performance and CPU waste and needs to be
+tuned to your needs.  Maybe, someday, auto-tuning will be added, but
+that's not a simple thing and even the auto-tuning would need to be
+tuned to the user's desired performance.
+
 The driver supports a hot add and remove of interfaces.  This way,
 interfaces can be added or removed after the kernel is up and running.
 This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@ -1,3 +1,3 @@
 obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
-	filesystems/configfs/ ia64/ networking/ \
-	pcmcia/ spi/ video4linux/ vm/ watchdog/src/
+	filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
+	pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@ -6,16 +6,22 @@ checklist.txt
 	- Review Checklist for RCU Patches
 listRCU.txt
 	- Using RCU to Protect Read-Mostly Linked Lists
+lockdep.txt
+	- RCU and lockdep checking
 NMI-RCU.txt
 	- Using RCU to Protect Dynamic NMI Handlers
+rcubarrier.txt
+	- RCU and Unloadable Modules
+rculist_nulls.txt
+	- RCU list primitives for use with SLAB_DESTROY_BY_RCU
 rcuref.txt
 	- Reference-count design for elements of lists/arrays protected by RCU
 rcu.txt
 	- RCU Concepts
-rcubarrier.txt
-	- Unloading modules that use RCU callbacks
 RTFP.txt
 	- List of RCU papers (bibliography) going back to 1980.
+stallwarn.txt
+	- RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR)
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
 trace.txt
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@ -25,10 +25,10 @@ to be referencing the data structure.  However, this mechanism was not
 optimized for modern computer systems, which is not surprising given
 that these overheads were not so expensive in the mid-80s.  Nonetheless,
 passive serialization appears to be the first deferred-destruction
-mechanism to be used in production.  Furthermore, the relevant patent has
-lapsed, so this approach may be used in non-GPL software, if desired.
-(In contrast, use of RCU is permitted only in software licensed under
-GPL.  Sorry!!!)
+mechanism to be used in production.  Furthermore, the relevant patent
+has lapsed, so this approach may be used in non-GPL software, if desired.
+(In contrast, implementation of RCU is permitted only in software licensed
+under either GPL or LGPL.  Sorry!!!)

 In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
 were reading a given data structure permitted deferred free to operate
@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part
 LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally,
 PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI].

+2008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ],
+a history of how Linux changed RCU more than RCU changed Linux
+[PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU
+[PaulEMcKenney2008HierarchicalRCU].
+
+2009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU],
+which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU]
+[MathieuDesnoyersPhD].  TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made
+its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU].
+The problem of resizeable RCU-protected hash tables may now be on a path
+to a solution [JoshTriplett2009RPHash].
+
 Bibtex Entries

@article{Kung80
@ -730,6 +742,11 @@ Revised:
 "
 }

+#
+#	"What is RCU?" LWN series.
+#
+########################################################################
+
@article{DinakarGuniguntala2008IBMSysJ
 ,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole"
 ,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}"
@ -820,3 +837,39 @@ Revised:
 	Uniprocessor assumptions allow simplified RCU implementation.
 "
 }
+
+@unpublished{PaulEMcKenney2009expeditedRCU
+,Author="Paul E. McKenney"
+,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods"
+,month="June"
+,day="25"
+,year="2009"
+,note="Available:
+\url{http://lkml.org/lkml/2009/6/25/306}
+[Viewed August 16, 2009]"
+,annotation="
+	First posting of expedited RCU to be accepted into -tip.
+"
+}
+
+@unpublished{JoshTriplett2009RPHash
+,Author="Josh Triplett"
+,Title="Scalable concurrent hash tables via relativistic programming"
+,month="September"
+,year="2009"
+,note="Linux Plumbers Conference presentation"
+,annotation="
+	RP fun with hash tables.
+"
+}
+
+@phdthesis{MathieuDesnoyersPhD
+, title  = "Low-Impact Operating System Tracing"
+, author = "Mathieu Desnoyers"
+, school = "Ecole Polytechnique de Montr\'{e}al"
+, month  = "December"
+, year   = 2009
+,note="Available:
+\url{http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf}
+[Viewed December 9, 2009]"
+}
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@ -8,13 +8,12 @@ would cause.  This list is based on experiences reviewing such patches
 over a rather long period of time, but improvements are always welcome!

 0.	Is RCU being applied to a read-mostly situation?  If the data
-	structure is updated more than about 10% of the time, then
-	you should strongly consider some other approach, unless
-	detailed performance measurements show that RCU is nonetheless
-	the right tool for the job.  Yes, you might think of RCU
-	as simply cutting overhead off of the readers and imposing it
-	on the writers.  That is exactly why normal uses of RCU will
-	do much more reading than updating.
+	structure is updated more than about 10% of the time, then you
+	should strongly consider some other approach, unless detailed
+	performance measurements show that RCU is nonetheless the right
+	tool for the job.  Yes, RCU does reduce read-side overhead by
+	increasing write-side overhead, which is exactly why normal uses
+	of RCU will do much more reading than updating.

 	Another exception is where performance is not an issue, and RCU
 	provides a simpler implementation.  An example of this situation
@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome!

 	If you choose #b, be prepared to describe how you have handled
 	memory barriers on weakly ordered machines (pretty much all of
-	them -- even x86 allows reads to be reordered), and be prepared
-	to explain why this added complexity is worthwhile.  If you
-	choose #c, be prepared to explain how this single task does not
-	become a major bottleneck on big multiprocessor machines (for
-	example, if the task is updating information relating to itself
-	that other tasks can read, there by definition can be no
-	bottleneck).
+	them -- even x86 allows later loads to be reordered to precede
+	earlier stores), and be prepared to explain why this added
+	complexity is worthwhile.  If you choose #c, be prepared to
+	explain how this single task does not become a major bottleneck on
+	big multiprocessor machines (for example, if the task is updating
+	information relating to itself that other tasks can read, there
+	by definition can be no bottleneck).

 2.	Do the RCU read-side critical sections make proper use of
 	rcu_read_lock() and friends?  These primitives are needed
@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome!
 	actuarial risk of your kernel.

 	As a rough rule of thumb, any dereference of an RCU-protected
-	pointer must be covered by rcu_read_lock() or rcu_read_lock_bh()
-	or by the appropriate update-side lock.
+	pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
+	rcu_read_lock_sched(), or by the appropriate update-side lock.
+	Disabling of preemption can serve as rcu_read_lock_sched(), but
+	is less readable.

 3.	Does the update code tolerate concurrent accesses?

@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome!
 	of ways to handle this concurrency, depending on the situation:

 	a.	Use the RCU variants of the list and hlist update
-		primitives to add, remove, and replace elements on an
-		RCU-protected list.  Alternatively, use the RCU-protected
-		trees that have been added to the Linux kernel.
+		primitives to add, remove, and replace elements on
+		an RCU-protected list.	Alternatively, use the other
+		RCU-protected data structures that have been added to
+		the Linux kernel.

 		This is almost always the best approach.

 	b.	Proceed as in (a) above, but also maintain per-element
 		locks (that are acquired by both readers and writers)
 		that guard per-element state.  Of course, fields that
-		the readers refrain from accessing can be guarded by the
-		update-side lock.
+		the readers refrain from accessing can be guarded by
+		some other lock acquired only by updaters, if desired.

 		This works quite well, also.

 	c.	Make updates appear atomic to readers.  For example,
-		pointer updates to properly aligned fields will appear
-		atomic, as will individual atomic primitives.  Operations
-		performed under a lock and sequences of multiple atomic
-		primitives will -not- appear to be atomic.
+		pointer updates to properly aligned fields will
+		appear atomic, as will individual atomic primitives.
+		Sequences of perations performed under a lock will -not-
+		appear to be atomic to RCU readers, nor will sequences
+		of multiple atomic primitives.

 		This can work, but is starting to get a bit tricky.

@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome!
 		a new structure containing updated values.

 4.	Weakly ordered CPUs pose special challenges.  Almost all CPUs
-	are weakly ordered -- even i386 CPUs allow reads to be reordered.
-	RCU code must take all of the following measures to prevent
-	memory-corruption problems:
+	are weakly ordered -- even x86 CPUs allow later loads to be
+	reordered to precede earlier stores.  RCU code must take all of
+	the following measures to prevent memory-corruption problems:

 	a.	Readers must maintain proper ordering of their memory
 		accesses.  The rcu_dereference() primitive ensures that
@ -113,14 +116,25 @@ over a rather long period of time, but improvements are always welcome!
 		The rcu_dereference() primitive is also an excellent
 		documentation aid, letting the person reading the code
 		know exactly which pointers are protected by RCU.
+		Please note that compilers can also reorder code, and
+		they are becoming increasingly aggressive about doing
+		just that.  The rcu_dereference() primitive therefore
+		also prevents destructive compiler optimizations.

-		The rcu_dereference() primitive is used by the various
-		"_rcu()" list-traversal primitives, such as the
-		list_for_each_entry_rcu().  Note that it is perfectly
-		legal (if redundant) for update-side code to use
-		rcu_dereference() and the "_rcu()" list-traversal
-		primitives.  This is particularly useful in code
-		that is common to readers and updaters.
+		The rcu_dereference() primitive is used by the
+		various "_rcu()" list-traversal primitives, such
+		as the list_for_each_entry_rcu().  Note that it is
+		perfectly legal (if redundant) for update-side code to
+		use rcu_dereference() and the "_rcu()" list-traversal
+		primitives.  This is particularly useful in code that
+		is common to readers and updaters.  However, lockdep
+		will complain if you access rcu_dereference() outside
+		of an RCU read-side critical section.  See lockdep.txt
+		to learn what to do about this.
+
+		Of course, neither rcu_dereference() nor the "_rcu()"
+		list-traversal primitives can substitute for a good
+		concurrency design coordinating among multiple updaters.

 	b.	If the list macros are being used, the list_add_tail_rcu()
 		and list_add_rcu() primitives must be used in order
@ -135,11 +149,14 @@ over a rather long period of time, but improvements are always welcome!
 		readers.  Similarly, if the hlist macros are being used,
 		the hlist_del_rcu() primitive is required.

-		The list_replace_rcu() primitive may be used to
-		replace an old structure with a new one in an
-		RCU-protected list.
+		The list_replace_rcu() and hlist_replace_rcu() primitives
+		may be used to replace an old structure with a new one
+		in their respective types of RCU-protected lists.

-	d.	Updates must ensure that initialization of a given
+	d.	Rules similar to (4b) and (4c) apply to the "hlist_nulls"
+		type of RCU-protected linked lists.
+
+	e.	Updates must ensure that initialization of a given
 		structure happens before pointers to that structure are
 		publicized.  Use the rcu_assign_pointer() primitive
 		when publicizing a pointer to a structure that can
@ -151,16 +168,31 @@ over a rather long period of time, but improvements are always welcome!
 	it cannot block.

 6.	Since synchronize_rcu() can block, it cannot be called from
-	any sort of irq context.  Ditto for synchronize_sched() and
-	synchronize_srcu().
+	any sort of irq context.  The same rule applies for
+	synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
+	synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
+	synchronize_sched_expedite(), and synchronize_srcu_expedited().

-7.	If the updater uses call_rcu(), then the corresponding readers
-	must use rcu_read_lock() and rcu_read_unlock().  If the updater
-	uses call_rcu_bh(), then the corresponding readers must use
-	rcu_read_lock_bh() and rcu_read_unlock_bh().  If the updater
-	uses call_rcu_sched(), then the corresponding readers must
-	disable preemption.  Mixing things up will result in confusion
-	and broken kernels.
+	The expedited forms of these primitives have the same semantics
+	as the non-expedited forms, but expediting is both expensive
+	and unfriendly to real-time workloads.	Use of the expedited
+	primitives should be restricted to rare configuration-change
+	operations that would not normally be undertaken while a real-time
+	workload is running.
+
+7.	If the updater uses call_rcu() or synchronize_rcu(), then the
+	corresponding readers must use rcu_read_lock() and
+	rcu_read_unlock().  If the updater uses call_rcu_bh() or
+	synchronize_rcu_bh(), then the corresponding readers must
+	use rcu_read_lock_bh() and rcu_read_unlock_bh().  If the
+	updater uses call_rcu_sched() or synchronize_sched(), then
+	the corresponding readers must disable preemption, possibly
+	by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
+	If the updater uses synchronize_srcu(), the the corresponding
+	readers must use srcu_read_lock() and srcu_read_unlock(),
+	and with the same srcu_struct.	The rules for the expedited
+	primitives are the same as for their non-expedited counterparts.
+	Mixing things up will result in confusion and broken kernels.

 	One exception to this rule: rcu_read_lock() and rcu_read_unlock()
 	may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
@ -212,6 +244,8 @@ over a rather long period of time, but improvements are always welcome!
 	e.	Periodically invoke synchronize_rcu(), permitting a limited
 		number of updates per grace period.

+	The same cautions apply to call_rcu_bh() and call_rcu_sched().
+
 9.	All RCU list-traversal primitives, which include
 	rcu_dereference(), list_for_each_entry_rcu(),
 	list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
@ -219,7 +253,9 @@ over a rather long period of time, but improvements are always welcome!
 	must be protected by appropriate update-side locks.  RCU
 	read-side critical sections are delimited by rcu_read_lock()
 	and rcu_read_unlock(), or by similar primitives such as
-	rcu_read_lock_bh() and rcu_read_unlock_bh().
+	rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case
+	the matching rcu_dereference() primitive must be used in order
+	to keep lockdep happy, in this case, rcu_dereference_bh().

 	The reason that it is permissible to use RCU list-traversal
 	primitives when the update-side lock is held is that doing so
@ -229,7 +265,8 @@ over a rather long period of time, but improvements are always welcome!
 10.	Conversely, if you are in an RCU read-side critical section,
 	and you don't hold the appropriate update-side lock, you -must-
 	use the "_rcu()" variants of the list macros.  Failing to do so
-	will break Alpha and confuse people reading your code.
+	will break Alpha, cause aggressive compilers to generate bad code,
+	and confuse people trying to read your code.

 11.	Note that synchronize_rcu() -only- guarantees to wait until
 	all currently executing rcu_read_lock()-protected RCU read-side
@ -239,15 +276,21 @@ over a rather long period of time, but improvements are always welcome!
 	rcu_read_lock()-protected read-side critical sections, do -not-
 	use synchronize_rcu().

-	If you want to wait for some of these other things, you might
-	instead need to use synchronize_irq() or synchronize_sched().
+	Similarly, disabling preemption is not an acceptable substitute
+	for rcu_read_lock().  Code that attempts to use preemption
+	disabling where it should be using rcu_read_lock() will break
+	in real-time kernel builds.
+
+	If you want to wait for interrupt handlers, NMI handlers, and
+	code under the influence of preempt_disable(), you instead
+	need to use synchronize_irq() or synchronize_sched().

 12.	Any lock acquired by an RCU callback must be acquired elsewhere
 	with softirq disabled, e.g., via spin_lock_irqsave(),
 	spin_lock_bh(), etc.  Failing to disable irq on a given
-	acquisition of that lock will result in deadlock as soon as the
-	RCU callback happens to interrupt that acquisition's critical
-	section.
+	acquisition of that lock will result in deadlock as soon as
+	the RCU softirq handler happens to run your RCU callback while
+	interrupting that acquisition's critical section.

 13.	RCU callbacks can be and are executed in parallel.  In many cases,
 	the callback code simply wrappers around kfree(), so that this
@ -265,29 +308,30 @@ over a rather long period of time, but improvements are always welcome!
 	not the case, a self-spawning RCU callback would prevent the
 	victim CPU from ever going offline.)

-14.	SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu())
-	may only be invoked from process context.  Unlike other forms of
-	RCU, it -is- permissible to block in an SRCU read-side critical
-	section (demarked by srcu_read_lock() and srcu_read_unlock()),
-	hence the "SRCU": "sleepable RCU".  Please note that if you
-	don't need to sleep in read-side critical sections, you should
-	be using RCU rather than SRCU, because RCU is almost always
-	faster and easier to use than is SRCU.
+14.	SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
+	synchronize_srcu(), and synchronize_srcu_expedited()) may only
+	be invoked from process context.  Unlike other forms of RCU, it
+	-is- permissible to block in an SRCU read-side critical section
+	(demarked by srcu_read_lock() and srcu_read_unlock()), hence the
+	"SRCU": "sleepable RCU".  Please note that if you don't need
+	to sleep in read-side critical sections, you should be using
+	RCU rather than SRCU, because RCU is almost always faster and
+	easier to use than is SRCU.

 	Also unlike other forms of RCU, explicit initialization
 	and cleanup is required via init_srcu_struct() and
 	cleanup_srcu_struct().	These are passed a "struct srcu_struct"
 	that defines the scope of a given SRCU domain.	Once initialized,
 	the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
-	and synchronize_srcu().  A given synchronize_srcu() waits only
-	for SRCU read-side critical sections governed by srcu_read_lock()
-	and srcu_read_unlock() calls that have been passd the same
-	srcu_struct.  This property is what makes sleeping read-side
-	critical sections tolerable -- a given subsystem delays only
-	its own updates, not those of other subsystems using SRCU.
-	Therefore, SRCU is less prone to OOM the system than RCU would
-	be if RCU's read-side critical sections were permitted to
-	sleep.
+	synchronize_srcu(), and synchronize_srcu_expedited().  A given
+	synchronize_srcu() waits only for SRCU read-side critical
+	sections governed by srcu_read_lock() and srcu_read_unlock()
+	calls that have been passed the same srcu_struct.  This property
+	is what makes sleeping read-side critical sections tolerable --
+	a given subsystem delays only its own updates, not those of other
+	subsystems using SRCU.	Therefore, SRCU is less prone to OOM the
+	system than RCU would be if RCU's read-side critical sections
+	were permitted to sleep.

 	The ability to sleep in read-side critical sections does not
 	come for free.	First, corresponding srcu_read_lock() and
@ -311,12 +355,12 @@ over a rather long period of time, but improvements are always welcome!
 	destructive operation, and -only- -then- invoke call_rcu(),
 	synchronize_rcu(), or friends.

-	Because these primitives only wait for pre-existing readers,
-	it is the caller's responsibility to guarantee safety to
-	any subsequent readers.
+	Because these primitives only wait for pre-existing readers, it
+	is the caller's responsibility to guarantee that any subsequent
+	readers will execute safely.

-16.	The various RCU read-side primitives do -not- contain memory
-	barriers.  The CPU (and in some cases, the compiler) is free
-	to reorder code into and out of RCU read-side critical sections.
-	It is the responsibility of the RCU update-side primitives to
-	deal with this.
+16.	The various RCU read-side primitives do -not- necessarily contain
+	memory barriers.  You should therefore plan for the CPU
+	and the compiler to freely reorder code into and out of RCU
+	read-side critical sections.  It is the responsibility of the
+	RCU update-side primitives to deal with this.
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@ -0,0 +1,67 @@
+RCU and lockdep checking
+
+All flavors of RCU have lockdep checking available, so that lockdep is
+aware of when each task enters and leaves any flavor of RCU read-side
+critical section.  Each flavor of RCU is tracked separately (but note
+that this is not the case in 2.6.32 and earlier).  This allows lockdep's
+tracking to include RCU state, which can sometimes help when debugging
+deadlocks and the like.
+
+In addition, RCU provides the following primitives that check lockdep's
+state:
+
+	rcu_read_lock_held() for normal RCU.
+	rcu_read_lock_bh_held() for RCU-bh.
+	rcu_read_lock_sched_held() for RCU-sched.
+	srcu_read_lock_held() for SRCU.
+
+These functions are conservative, and will therefore return 1 if they
+aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set).
+This prevents things like WARN_ON(!rcu_read_lock_held()) from giving false
+positives when lockdep is disabled.
+
+In addition, a separate kernel config parameter CONFIG_PROVE_RCU enables
+checking of rcu_dereference() primitives:
+
+	rcu_dereference(p):
+		Check for RCU read-side critical section.
+	rcu_dereference_bh(p):
+		Check for RCU-bh read-side critical section.
+	rcu_dereference_sched(p):
+		Check for RCU-sched read-side critical section.
+	srcu_dereference(p, sp):
+		Check for SRCU read-side critical section.
+	rcu_dereference_check(p, c):
+		Use explicit check expression "c".
+	rcu_dereference_raw(p)
+		Don't check.  (Use sparingly, if at all.)
+
+The rcu_dereference_check() check expression can be any boolean
+expression, but would normally include one of the rcu_read_lock_held()
+family of functions and a lockdep expression.  However, any boolean
+expression can be used.  For a moderately ornate example, consider
+the following:
+
+	file = rcu_dereference_check(fdt->fd[fd],
+				     rcu_read_lock_held() ||
+				     lockdep_is_held(&files->file_lock) ||
+				     atomic_read(&files->count) == 1);
+
+This expression picks up the pointer "fdt->fd[fd]" in an RCU-safe manner,
+and, if CONFIG_PROVE_RCU is configured, verifies that this expression
+is used in:
+
+1.	An RCU read-side critical section, or
+2.	with files->file_lock held, or
+3.	on an unshared files_struct.
+
+In case (1), the pointer is picked up in an RCU-safe manner for vanilla
+RCU read-side critical sections, in case (2) the ->file_lock prevents
+any change from taking place, and finally, in case (3) the current task
+is the only task accessing the file_struct, again preventing any change
+from taking place.
+
+There are currently only "universal" versions of the rcu_assign_pointer()
+and RCU list-/tree-traversal primitives, which do not (yet) check for
+being in an RCU read-side critical section.  In the future, separate
+versions of these primitives might be created.
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@ -75,6 +75,8 @@ o	I hear that RCU is patented?  What is with that?
 	search for the string "Patent" in RTFP.txt to find them.
 	Of these, one was allowed to lapse by the assignee, and the
 	others have been contributed to the Linux kernel under GPL.
+	There are now also LGPL implementations of user-level RCU
+	available (http://lttng.org/?q=node/18).

 o	I hear that RCU needs work in order to support realtime kernels?

@ -91,48 +93,4 @@ o	Where can I find more information on RCU?

 o	What are all these files in this directory?

-
-	NMI-RCU.txt
-
-		Describes how to use RCU to implement dynamic
-		NMI handlers, which can be revectored on the fly,
-		without rebooting.
-
-	RTFP.txt
-
-		List of RCU-related publications and web sites.
-
-	UP.txt
-
-		Discussion of RCU usage in UP kernels.
-
-	arrayRCU.txt
-
-		Describes how to use RCU to protect arrays, with
-		resizeable arrays whose elements reference other
-		data structures being of the most interest.
-
-	checklist.txt
-
-		Lists things to check for when inspecting code that
-		uses RCU.
-
-	listRCU.txt
-
-		Describes how to use RCU to protect linked lists.
-		This is the simplest and most common use of RCU
-		in the Linux kernel.
-
-	rcu.txt
-
-		You are reading it!
-
-	rcuref.txt
-
-		Describes how to combine use of reference counts
-		with RCU.
-
-	whatisRCU.txt
-
-		Overview of how the RCU implementation works.  Along
-		the way, presents a conceptual view of RCU.
+	See 00-INDEX for the list.
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@ -0,0 +1,58 @@
+Using RCU's CPU Stall Detector
+
+The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
+RCU's CPU stall detector, which detects conditions that unduly delay
+RCU grace periods.  The stall detector's idea of what constitutes
+"unduly delayed" is controlled by a pair of C preprocessor macros:
+
+RCU_SECONDS_TILL_STALL_CHECK
+
+	This macro defines the period of time that RCU will wait from
+	the beginning of a grace period until it issues an RCU CPU
+	stall warning.	It is normally ten seconds.
+
+RCU_SECONDS_TILL_STALL_RECHECK
+
+	This macro defines the period of time that RCU will wait after
+	issuing a stall warning until it issues another stall warning.
+	It is normally set to thirty seconds.
+
+RCU_STALL_RAT_DELAY
+
+	The CPU stall detector tries to make the offending CPU rat on itself,
+	as this often gives better-quality stack traces.  However, if
+	the offending CPU does not detect its own stall in the number
+	of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will
+	complain.  This is normally set to two jiffies.
+
+The following problems can result in an RCU CPU stall warning:
+
+o	A CPU looping in an RCU read-side critical section.
+	
+o	A CPU looping with interrupts disabled.
+
+o	A CPU looping with preemption disabled.
+
+o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
+	without invoking schedule().
+
+o	A bug in the RCU implementation.
+
+o	A hardware failure.  This is quite unlikely, but has occurred
+	at least once in a former life.  A CPU failed in a running system,
+	becoming unresponsive, but not causing an immediate crash.
+	This resulted in a series of RCU CPU stall warnings, eventually
+	leading the realization that the CPU had failed.
+
+The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning.
+SRCU does not do so directly, but its calls to synchronize_sched() will
+result in RCU-sched detecting any CPU stalls that might be occurring.
+
+To diagnose the cause of the stall, inspect the stack traces.  The offending
+function will usually be near the top of the stack.  If you have a series
+of stall warnings from a single extended stall, comparing the stack traces
+can often help determine where the stall is occurring, which will usually
+be in the function nearest the top of the stack that stays the same from
+trace to trace.
+
+RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@ -30,6 +30,18 @@ MODULE PARAMETERS

 This module has the following parameters:

+fqs_duration	Duration (in microseconds) of artificially induced bursts
+		of force_quiescent_state() invocations.  In RCU
+		implementations having force_quiescent_state(), these
+		bursts help force races between forcing a given grace
+		period and that grace period ending on its own.
+
+fqs_holdoff	Holdoff time (in microseconds) between consecutive calls
+		to force_quiescent_state() within a burst.
+
+fqs_stutter	Wait time (in seconds) between consecutive bursts
+		of calls to force_quiescent_state().
+
 irqreaders	Says to invoke RCU readers from irq level.  This is currently
 		done via timers.  Defaults to "1" for variants of RCU that
 		permit this.  (Or, more accurately, variants of RCU that do
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@ -323,14 +323,17 @@ used as follows:
 	Defer			Protect

 a.	synchronize_rcu()	rcu_read_lock() / rcu_read_unlock()
-	call_rcu()
+	call_rcu()		rcu_dereference()

 b.	call_rcu_bh()		rcu_read_lock_bh() / rcu_read_unlock_bh()
+				rcu_dereference_bh()

-c.	synchronize_sched()	preempt_disable() / preempt_enable()
+c.	synchronize_sched()	rcu_read_lock_sched() / rcu_read_unlock_sched()
+				preempt_disable() / preempt_enable()
 				local_irq_save() / local_irq_restore()
 				hardirq enter / hardirq exit
 				NMI enter / NMI exit
+				rcu_dereference_sched()

 These three mechanisms are used as follows:

@ -780,9 +783,8 @@ Linux-kernel source code, but it helps to have a full list of the
 APIs, since there does not appear to be a way to categorize them
 in docbook.  Here is the list, by category.

-RCU pointer/list traversal:
+RCU list traversal:

-	rcu_dereference
 	list_for_each_entry_rcu
 	hlist_for_each_entry_rcu
 	hlist_nulls_for_each_entry_rcu
@ -808,7 +810,7 @@ RCU:	Critical sections	Grace period		Barrier

 	rcu_read_lock		synchronize_net		rcu_barrier
 	rcu_read_unlock		synchronize_rcu
-				synchronize_rcu_expedited
+	rcu_dereference		synchronize_rcu_expedited
 				call_rcu


@ -816,7 +818,7 @@ bh:	Critical sections	Grace period		Barrier

 	rcu_read_lock_bh	call_rcu_bh		rcu_barrier_bh
 	rcu_read_unlock_bh	synchronize_rcu_bh
-				synchronize_rcu_bh_expedited
+	rcu_dereference_bh	synchronize_rcu_bh_expedited


 sched:	Critical sections	Grace period		Barrier
@ -825,12 +827,14 @@ sched:	Critical sections	Grace period		Barrier
 	rcu_read_unlock_sched	call_rcu_sched
 	[preempt_disable]	synchronize_sched_expedited
 	[and friends]
+	rcu_dereference_sched


 SRCU:	Critical sections	Grace period		Barrier

 	srcu_read_lock		synchronize_srcu	N/A
 	srcu_read_unlock	synchronize_srcu_expedited
+	srcu_dereference

 SRCU:	Initialization/cleanup
 	init_srcu_struct
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@ -9,10 +9,14 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux
 kernel patches.


-1: Builds cleanly with applicable or modified CONFIG options =y, =m, and
+1: If you use a facility then #include the file that defines/declares
+   that facility.  Don't depend on other header files pulling in ones
+   that you use.
+
+2: Builds cleanly with applicable or modified CONFIG options =y, =m, and
   =n.  No gcc warnings/errors, no linker warnings/errors.

-2: Passes allnoconfig, allmodconfig
+2b: Passes allnoconfig, allmodconfig

 3: Builds on multiple CPU architectures by using local cross-compile tools
   or some other build farm.
--- a/Documentation/acpi/method-customizing.txt
+++ b/Documentation/acpi/method-customizing.txt
@ -0,0 +1,66 @@
+Linux ACPI Custom Control Method How To
+=======================================
+
+Written by Zhang Rui <rui.zhang@intel.com>
+
+
+Linux supports customizing ACPI control methods at runtime.
+
+Users can use this to
+1. override an existing method which may not work correctly,
+   or just for debugging purposes.
+2. insert a completely new method in order to create a missing
+   method such as _OFF, _ON, _STA, _INI, etc.
+For these cases, it is far simpler to dynamically install a single
+control method rather than override the entire DSDT, because kernel
+rebuild/reboot is not needed and test result can be got in minutes.
+
+Note: Only ACPI METHOD can be overridden, any other object types like
+      "Device", "OperationRegion", are not recognized.
+Note: The same ACPI control method can be overridden for many times,
+      and it's always the latest one that used by Linux/kernel.
+
+1. override an existing method
+   a) get the ACPI table via ACPI sysfs I/F. e.g. to get the DSDT,
+      just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat"
+   b) disassemble the table by running "iasl -d dsdt.dat".
+   c) rewrite the ASL code of the method and save it in a new file,
+   d) package the new file (psr.asl) to an ACPI table format.
+      Here is an example of a customized \_SB._AC._PSR method,
+
+      DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
+      {
+	External (ACON)
+
+	Method (\_SB_.AC._PSR, 0, NotSerialized)
+	{
+		Store ("In AC _PSR", Debug)
+		Return (ACON)
+	}
+      }
+      Note that the full pathname of the method in ACPI namespace
+      should be used.
+      And remember to use "External" to declare external objects.
+   e) assemble the file to generate the AML code of the method.
+      e.g. "iasl psr.asl" (psr.aml is generated as a result)
+   f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
+   g) override the old method via the debugfs by running
+      "cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
+
+2. insert a new method
+   This is easier than overriding an existing method.
+   We just need to create the ASL code of the method we want to
+   insert and then follow the step c) ~ g) in section 1.
+
+3. undo your changes
+   The "undo" operation is not supported for a new inserted method
+   right now, i.e. we can not remove a method currently.
+   For an overrided method, in order to undo your changes, please
+   save a copy of the method original ASL code in step c) section 1,
+   and redo step c) ~ g) to override the method with the original one.
+
+
+Note: We can use a kernel with multiple custom ACPI method running,
+      But each individual write to debugfs can implement a SINGLE
+      method override. i.e. if we want to insert/override multiple
+      ACPI methods, we need to redo step c) ~ g) for multiple times.
--- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
+++ b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
@ -14,8 +14,8 @@ Introduction
 how the clocks are arranged. The first implementation used as single
 PLL to feed the ARM, memory and peripherals via a series of dividers
 and muxes and this is the implementation that is documented here. A
- newer version where there is a seperate PLL and clock divider for the
- ARM core is available as a seperate driver.
+ newer version where there is a separate PLL and clock divider for the
+ ARM core is available as a separate driver.


 Layout
--- a/Documentation/arm/Samsung/Overview.txt
+++ b/Documentation/arm/Samsung/Overview.txt
@ -0,0 +1,86 @@
+		Samsung ARM Linux Overview
+		==========================
+
+Introduction
+------------
+
+  The Samsung range of ARM SoCs spans many similar devices, from the initial
+  ARM9 through to the newest ARM cores. This document shows an overview of
+  the current kernel support, how to use it and where to find the code
+  that supports this.
+
+  The currently supported SoCs are:
+
+  - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
+  - S3C64XX: S3C6400 and S3C6410
+  - S5PC6440
+
+  S5PC100 and S5PC110 support is currently being merged
+
+
+S3C24XX Systems
+---------------
+
+  There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
+  deals with the architecture and drivers specific to these devices.
+
+  See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
+  on the implementation details and specific support.
+
+
+Configuration
+-------------
+
+  A number of configurations are supplied, as there is no current way of
+  unifying all the SoCs into one kernel.
+
+  s5p6440_defconfig - S5P6440 specific default configuration
+  s5pc100_defconfig - S5PC100 specific default configuration
+
+
+Layout
+------
+
+  The directory layout is currently being restructured, and consists of
+  several platform directories and then the machine specific directories
+  of the CPUs being built for.
+
+  plat-samsung provides the base for all the implementations, and is the
+  last in the line of include directories that are processed for the build
+  specific information. It contains the base clock, GPIO and device definitions
+  to get the system running.
+
+  plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently
+  involved in other builds this will be phased out once the relevant code is
+  moved elsewhere.
+
+  plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
+
+  plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs.
+
+  plat-s5p is for s5p specific builds, more to be added.
+
+
+  [ to finish ]
+
+
+Port Contributors
+-----------------
+
+  Ben Dooks (BJD)
+  Vincent Sanders
+  Herbert Potzl
+  Arnaud Patard (RTP)
+  Roc Wu
+  Klaus Fetscher
+  Dimitry Andric
+  Shannon Holland
+  Guillaume Gourat (NexVision)
+  Christer Weinigel (wingel) (Acer N30)
+  Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
--- a/Documentation/arm/Samsung/clksrc-change-registers.awk
+++ b/Documentation/arm/Samsung/clksrc-change-registers.awk
@ -0,0 +1,167 @@
+#!/usr/bin/awk -f
+#
+# Copyright 2010 Ben Dooks <ben-linux@fluff.org>
+#
+# Released under GPLv2
+
+# example usage
+# ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst
+
+function extract_value(s)
+{
+    eqat = index(s, "=")
+    comat = index(s, ",")
+    return substr(s, eqat+2, (comat-eqat)-2)
+}
+
+function remove_brackets(b)
+{
+    return substr(b, 2, length(b)-2)
+}
+
+function splitdefine(l, p)
+{
+    r = split(l, tp)
+
+    p[0] = tp[2]
+    p[1] = remove_brackets(tp[3])
+}
+
+function find_length(f)
+{
+    if (0)
+	printf "find_length " f "\n" > "/dev/stderr"
+
+    if (f ~ /0x1/)
+	return 1
+    else if (f ~ /0x3/)
+	return 2
+    else if (f ~ /0x7/)
+	return 3
+    else if (f ~ /0xf/)
+	return 4
+
+    printf "unknown legnth " f "\n" > "/dev/stderr"
+    exit
+}
+
+function find_shift(s)
+{
+    id = index(s, "<")
+    if (id <= 0) {
+	printf "cannot find shift " s "\n" > "/dev/stderr"
+	exit
+    }
+
+    return substr(s, id+2)
+}
+
+
+BEGIN {
+    if (ARGC < 2) {
+	print "too few arguments" > "/dev/stderr"
+	exit
+    }
+
+# read the header file and find the mask values that we will need
+# to replace and create an associative array of values
+
+    while (getline line < ARGV[1] > 0) {
+	if (line ~ /\#define.*_MASK/ &&
+	    !(line ~ /S5PC100_EPLL_MASK/) &&
+	    !(line ~ /USB_SIG_MASK/)) {
+	    splitdefine(line, fields)
+	    name = fields[0]
+	    if (0)
+		printf "MASK " line "\n" > "/dev/stderr"
+	    dmask[name,0] = find_length(fields[1])
+	    dmask[name,1] = find_shift(fields[1])
+	    if (0)
+		printf "=> '" name "' LENGTH=" dmask[name,0] " SHIFT=" dmask[name,1] "\n" > "/dev/stderr"
+	} else {
+	}
+    }
+
+    delete ARGV[1]
+}
+
+/clksrc_clk.*=.*{/ {
+    shift=""
+    mask=""
+    divshift=""
+    reg_div=""
+    reg_src=""
+    indent=1
+
+    print $0
+
+    for(; indent >= 1;) {
+	if ((getline line) <= 0) {
+	    printf "unexpected end of file" > "/dev/stderr"
+	    exit 1;
+	}
+
+	if (line ~ /\.shift/) {
+	    shift = extract_value(line)
+	} else if (line ~ /\.mask/) {
+	    mask = extract_value(line)
+	} else if (line ~ /\.reg_divider/) {
+	    reg_div = extract_value(line)
+	} else if (line ~ /\.reg_source/) {
+	    reg_src = extract_value(line)
+	} else if (line ~ /\.divider_shift/) {
+	    divshift = extract_value(line)
+	} else if (line ~ /{/) {
+		indent++
+		print line
+	    } else if (line ~ /}/) {
+	    indent--
+
+	    if (indent == 0) {
+		if (0) {
+		    printf "shift '" shift   "' ='" dmask[shift,0] "'\n" > "/dev/stderr"
+		    printf "mask  '" mask    "'\n" > "/dev/stderr"
+		    printf "dshft '" divshift "'\n" > "/dev/stderr"
+		    printf "rdiv  '" reg_div "'\n" > "/dev/stderr"
+		    printf "rsrc  '" reg_src "'\n" > "/dev/stderr"
+		}
+
+		generated = mask
+		sub(reg_src, reg_div, generated)
+
+		if (0) {
+		    printf "/* rsrc " reg_src " */\n"
+		    printf "/* rdiv " reg_div " */\n"
+		    printf "/* shift " shift " */\n"
+		    printf "/* mask " mask " */\n"
+		    printf "/* generated " generated " */\n"
+		}
+
+		if (reg_div != "") {
+		    printf "\t.reg_div = { "
+		    printf ".reg = " reg_div ", "
+		    printf ".shift = " dmask[generated,1] ", "
+		    printf ".size = " dmask[generated,0] ", "
+		    printf "},\n"
+		}
+
+		printf "\t.reg_src = { "
+		printf ".reg = " reg_src ", "
+		printf ".shift = " dmask[mask,1] ", "
+		printf ".size = " dmask[mask,0] ", "
+
+		printf "},\n"
+
+	    }
+
+	    print line
+	} else {
+	    print line
+	}
+
+	if (0)
+	    printf indent ":" line "\n" > "/dev/stderr"
+    }
+}
+
+// && ! /clksrc_clk.*=.*{/ { print $0 }
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@ -59,7 +59,11 @@ PAGE_OFFSET	high_memory-1	Kernel direct-mapped RAM region.
 				This maps the platforms RAM, and typically
 				maps all platform RAM in a 1:1 relationship.

-TASK_SIZE	PAGE_OFFSET-1	Kernel module space
+PKMAP_BASE	PAGE_OFFSET-1	Permanent kernel mappings
+				One way of mapping HIGHMEM pages into kernel
+				space.
+
+MODULES_VADDR	MODULES_END-1	Kernel module space
 				Kernel modules inserted via insmod are
 				placed here using dynamic mappings.

--- a/Documentation/blackfin/00-INDEX
+++ b/Documentation/blackfin/00-INDEX
@ -1,9 +1,6 @@
 00-INDEX
 	- This file

-cache-lock.txt
-	- HOWTO for blackfin cache locking.
-
 cachefeatures.txt
 	- Supported cache features.

--- a/Documentation/blackfin/Makefile
+++ b/Documentation/blackfin/Makefile
@ -0,0 +1,6 @@
+obj-m := gptimers-example.o
+
+all: modules
+
+modules clean:
+	$(MAKE) -C ../.. SUBDIRS=$(PWD) $@
--- a/Documentation/blackfin/cache-lock.txt
+++ b/Documentation/blackfin/cache-lock.txt
@ -1,48 +0,0 @@
-/*
- * File:         Documentation/blackfin/cache-lock.txt
- * Based on:
- * Author:
- *
- * Created:
- * Description:  This file contains the simple DMA Implementation for Blackfin
- *
- * Rev:          $Id: cache-lock.txt 2384 2006-11-01 04:12:43Z magicyang $
- *
- * Modified:
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- */
-
-How to lock your code in cache in uClinux/blackfin
--------------------------------------------------
-
-There are only a few steps required to lock your code into the cache.
-Currently you can lock the code by Way.
-
-Below are the interface provided for locking the cache.
-
-
-1. cache_grab_lock(int Ways);
-
-This function grab the lock for locking your code into the cache specified
-by Ways.
-
-
-2. cache_lock(int Ways);
-
-This function should be called after your critical code has been executed.
-Once the critical code exits, the code is now loaded into the cache. This
-function locks the code into the cache.
-
-
-So, the example sequence will be:
-
-	cache_grab_lock(WAY0_L);	/* Grab the lock */
-
-	critical_code();		/* Execute the code of interest */
-
-	cache_lock(WAY0_L);		/* Lock the cache */
-
-Where WAY0_L signifies WAY0 locking.
--- a/Documentation/blackfin/cachefeatures.txt
+++ b/Documentation/blackfin/cachefeatures.txt
@ -41,16 +41,6 @@
 		icplb_flush();
 		dcplb_flush();

-	- Locking the cache.
-
-		cache_grab_lock();
-		cache_lock();
-
-	Please refer linux-2.6.x/Documentation/blackfin/cache-lock.txt for how to
-	lock the cache.
-
-	Locking the cache is optional feature.
-
 	- Miscellaneous cache functions.

 		flush_cache_all();
--- a/Documentation/blackfin/gptimers-example.c
+++ b/Documentation/blackfin/gptimers-example.c
@ -0,0 +1,83 @@
+/*
+ * Simple gptimers example
+ *	http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:gptimers
+ *
+ * Copyright 2007-2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include <asm/gptimers.h>
+#include <asm/portmux.h>
+
+/* ... random driver includes ... */
+
+#define DRIVER_NAME "gptimer_example"
+
+struct gptimer_data {
+	uint32_t period, width;
+};
+static struct gptimer_data data;
+
+/* ... random driver state ... */
+
+static irqreturn_t gptimer_example_irq(int irq, void *dev_id)
+{
+	struct gptimer_data *data = dev_id;
+
+	/* make sure it was our timer which caused the interrupt */
+	if (!get_gptimer_intr(TIMER5_id))
+		return IRQ_NONE;
+
+	/* read the width/period values that were captured for the waveform */
+	data->width = get_gptimer_pwidth(TIMER5_id);
+	data->period = get_gptimer_period(TIMER5_id);
+
+	/* acknowledge the interrupt */
+	clear_gptimer_intr(TIMER5_id);
+
+	/* tell the upper layers we took care of things */
+	return IRQ_HANDLED;
+}
+
+/* ... random driver code ... */
+
+static int __init gptimer_example_init(void)
+{
+	int ret;
+
+	/* grab the peripheral pins */
+	ret = peripheral_request(P_TMR5, DRIVER_NAME);
+	if (ret) {
+		printk(KERN_NOTICE DRIVER_NAME ": peripheral request failed\n");
+		return ret;
+	}
+
+	/* grab the IRQ for the timer */
+	ret = request_irq(IRQ_TIMER5, gptimer_example_irq, IRQF_SHARED, DRIVER_NAME, &data);
+	if (ret) {
+		printk(KERN_NOTICE DRIVER_NAME ": IRQ request failed\n");
+		peripheral_free(P_TMR5);
+		return ret;
+	}
+
+	/* setup the timer and enable it */
+	set_gptimer_config(TIMER5_id, WDTH_CAP | PULSE_HI | PERIOD_CNT | IRQ_ENA);
+	enable_gptimers(TIMER5bit);
+
+	return 0;
+}
+module_init(gptimer_example_init);
+
+static void __exit gptimer_example_exit(void)
+{
+	disable_gptimers(TIMER5bit);
+	free_irq(IRQ_TIMER5, &data);
+	peripheral_free(P_TMR5);
+}
+module_exit(gptimer_example_exit);
+
+MODULE_LICENSE("BSD");
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@ -1,7 +1,5 @@
 00-INDEX
 	- This file
-as-iosched.txt
-	- Anticipatory IO scheduler
 barrier.txt
 	- I/O Barriers
 biodoc.txt
--- a/Documentation/block/as-iosched.txt
+++ b/Documentation/block/as-iosched.txt
@ -1,172 +0,0 @@
-Anticipatory IO scheduler
-------------------------
-Nick Piggin <piggin@cyberone.com.au>    13 Sep 2003
-
-Attention! Database servers, especially those using "TCQ" disks should
-investigate performance with the 'deadline' IO scheduler. Any system with high
-disk performance requirements should do so, in fact.
-
-If you see unusual performance characteristics of your disk systems, or you
-see big performance regressions versus the deadline scheduler, please email
-me. Database users don't bother unless you're willing to test a lot of patches
-from me ;) its a known issue.
-
-Also, users with hardware RAID controllers, doing striping, may find
-highly variable performance results with using the as-iosched. The
-as-iosched anticipatory implementation is based on the notion that a disk
-device has only one physical seeking head.  A striped RAID controller
-actually has a head for each physical device in the logical RAID device.
-
-However, setting the antic_expire (see tunable parameters below) produces
-very similar behavior to the deadline IO scheduler.
-
-Selecting IO schedulers
-----------------------
-Refer to Documentation/block/switching-sched.txt for information on
-selecting an io scheduler on a per-device basis.
-
-Anticipatory IO scheduler Policies
----------------------------------
-The as-iosched implementation implements several layers of policies
-to determine when an IO request is dispatched to the disk controller.
-Here are the policies outlined, in order of application.
-
-1. one-way Elevator algorithm.
-
-The elevator algorithm is similar to that used in deadline scheduler, with
-the addition that it allows limited backward movement of the elevator
-(i.e. seeks backwards).  A seek backwards can occur when choosing between
-two IO requests where one is behind the elevator's current position, and
-the other is in front of the elevator's position. If the seek distance to
-the request in back of the elevator is less than half the seek distance to
-the request in front of the elevator, then the request in back can be chosen.
-Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors.
-This favors forward movement of the elevator, while allowing opportunistic
-"short" backward seeks.
-
-2. FIFO expiration times for reads and for writes.
-
-This is again very similar to the deadline IO scheduler.  The expiration
-times for requests on these lists is tunable using the parameters read_expire
-and write_expire discussed below.  When a read or a write expires in this way,
-the IO scheduler will interrupt its current elevator sweep or read anticipation
-to service the expired request.
-
-3. Read and write request batching
-
-A batch is a collection of read requests or a collection of write
-requests.  The as scheduler alternates dispatching read and write batches
-to the driver.  In the case a read batch, the scheduler submits read
-requests to the driver as long as there are read requests to submit, and
-the read batch time limit has not been exceeded (read_batch_expire).
-The read batch time limit begins counting down only when there are
-competing write requests pending.
-
-In the case of a write batch, the scheduler submits write requests to
-the driver as long as there are write requests available, and the
-write batch time limit has not been exceeded (write_batch_expire).
-However, the length of write batches will be gradually shortened
-when read batches frequently exceed their time limit.
-
-When changing between batch types, the scheduler waits for all requests
-from the previous batch to complete before scheduling requests for the
-next batch.
-
-The read and write fifo expiration times described in policy 2 above
-are checked only when in scheduling IO of a batch for the corresponding
-(read/write) type.  So for example, the read FIFO timeout values are
-tested only during read batches.  Likewise, the write FIFO timeout
-values are tested only during write batches.  For this reason,
-it is generally not recommended for the read batch time
-to be longer than the write expiration time, nor for the write batch
-time to exceed the read expiration time (see tunable parameters below).
-
-When the IO scheduler changes from a read to a write batch,
-it begins the elevator from the request that is on the head of the
-write expiration FIFO.  Likewise, when changing from a write batch to
-a read batch, scheduler begins the elevator from the first entry
-on the read expiration FIFO.
-
-4. Read anticipation.
-
-Read anticipation occurs only when scheduling a read batch.
-This implementation of read anticipation allows only one read request
-to be dispatched to the disk controller at a time.  In
-contrast, many write requests may be dispatched to the disk controller
-at a time during a write batch.  It is this characteristic that can make
-the anticipatory scheduler perform anomalously with controllers supporting
-TCQ, or with hardware striped RAID devices. Setting the antic_expire
-queue parameter (see below) to zero disables this behavior, and the 
-anticipatory scheduler behaves essentially like the deadline scheduler.
-
-When read anticipation is enabled (antic_expire is not zero), reads
-are dispatched to the disk controller one at a time.
-At the end of each read request, the IO scheduler examines its next
-candidate read request from its sorted read list.  If that next request
-is from the same process as the request that just completed,
-or if the next request in the queue is "very close" to the
-just completed request, it is dispatched immediately.  Otherwise,
-statistics (average think time, average seek distance) on the process
-that submitted the just completed request are examined.  If it seems
-likely that that process will submit another request soon, and that
-request is likely to be near the just completed request, then the IO
-scheduler will stop dispatching more read requests for up to (antic_expire)
-milliseconds, hoping that process will submit a new request near the one
-that just completed.  If such a request is made, then it is dispatched
-immediately.  If the antic_expire wait time expires, then the IO scheduler
-will dispatch the next read request from the sorted read queue.
-
-To decide whether an anticipatory wait is worthwhile, the scheduler
-maintains statistics for each process that can be used to compute
-mean "think time" (the time between read requests), and mean seek
-distance for that process.  One observation is that these statistics
-are associated with each process, but those statistics are not associated
-with a specific IO device.  So for example, if a process is doing IO
-on several file systems on separate devices, the statistics will be
-a combination of IO behavior from all those devices.
-
-
-Tuning the anticipatory IO scheduler
------------------------------------
-When using 'as', the anticipatory IO scheduler there are 5 parameters under
-/sys/block/*/queue/iosched/. All are units of milliseconds.
-
-The parameters are:
-* read_expire
-    Controls how long until a read request becomes "expired". It also controls the
-    interval between which expired requests are served, so set to 50, a request
-    might take anywhere < 100ms to be serviced _if_ it is the next on the
-    expired list. Obviously request expiration strategies won't make the disk
-    go faster. The result basically equates to the timeslice a single reader
-    gets in the presence of other IO. 100*((seek time / read_expire) + 1) is
-    very roughly the % streaming read efficiency your disk should get with
-    multiple readers.
-
-* read_batch_expire
-    Controls how much time a batch of reads is given before pending writes are
-    served. A higher value is more efficient. This might be set below read_expire
-    if writes are to be given higher priority than reads, but reads are to be
-    as efficient as possible when there are no writes. Generally though, it
-    should be some multiple of read_expire.
-
-* write_expire, and
-* write_batch_expire are equivalent to the above, for writes.
-
-* antic_expire
-    Controls the maximum amount of time we can anticipate a good read (one
-    with a short seek distance from the most recently completed request) before
-    giving up. Many other factors may cause anticipation to be stopped early,
-    or some processes will not be "anticipated" at all. Should be a bit higher
-    for big seek time devices though not a linear correspondence - most
-    processes have only a few ms thinktime.
-
-In addition to the tunables above there is a read-only file named est_time
-which, when read, will show:
-
-    - The probability of a task exiting without a cooperating task
-      submitting an anticipated IO.
-
-    - The current mean think time.
-
-    - The seek distance used to determine if an incoming IO is better.
-
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
 do not have a corresponding kernel virtual address space mapping) and
 low-memory pages.

-Note: Please refer to Documentation/DMA-mapping.txt for a discussion
+Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion
 on PCI high mem DMA aspects and mapping of scatter gather lists, and support
 for 64 bit PCI.

--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@ -25,11 +25,11 @@ size allowed by the hardware.

 nomerges (RW)
 -------------
-This enables the user to disable the lookup logic involved with IO merging
-requests in the block layer. Merging may still occur through a direct
-1-hit cache, since that comes for (almost) free. The IO scheduler will not
-waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults
-to 0, enabling all merges.
+This enables the user to disable the lookup logic involved with IO
+merging requests in the block layer. By default (0) all merges are
+enabled. When set to 1 only simple one-hit merges will be tried. When
+set to 2 no merge algorithms will be tried (including one-hit or more
+complex tree/hash lookups).

 nr_requests (RW)
 ----------------
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@ -88,12 +88,12 @@ changes occur:
 	This is used primarily during fault processing.

 5) void update_mmu_cache(struct vm_area_struct *vma,
-			 unsigned long address, pte_t pte)
+			 unsigned long address, pte_t *ptep)

 	At the end of every page fault, this routine is invoked to
 	tell the architecture specific code that a translation
-	described by "pte" now exists at virtual address "address"
-	for address space "vma->vm_mm", in the software page tables.
+	now exists at virtual address "address" for address space
+	"vma->vm_mm", in the software page tables.

 	A port may use this information in any way it so chooses.
 	For example, it could use this event to pre-load TLB
@ -377,3 +377,27 @@ maps this page at its virtual address.
 	All the functionality of flush_icache_page can be implemented in
 	flush_dcache_page and update_mmu_cache. In 2.7 the hope is to
 	remove this interface completely.
+
+The final category of APIs is for I/O to deliberately aliased address
+ranges inside the kernel.  Such aliases are set up by use of the
+vmap/vmalloc API.  Since kernel I/O goes via physical pages, the I/O
+subsystem assumes that the user mapping and kernel offset mapping are
+the only aliases.  This isn't true for vmap aliases, so anything in
+the kernel trying to do I/O to vmap areas must manually manage
+coherency.  It must do this by flushing the vmap range before doing
+I/O and invalidating it after the I/O returns.
+
+  void flush_kernel_vmap_range(void *vaddr, int size)
+       flushes the kernel cache for a given virtual address range in
+       the vmap area.  This is to make sure that any data the kernel
+       modified in the vmap range is made visible to the physical
+       page.  The design is to make this area safe to perform I/O on.
+       Note that this API does *not* also flush the offset map alias
+       of the area.
+
+  void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
+       the cache for a given virtual address range in the vmap area
+       which prevents the processor from making the cache stale by
+       speculatively reading data while the I/O was occurring to the
+       physical pages.  This is only necessary for data reads into the
+       vmap area.
--- a/Documentation/cdrom/ide-cd
+++ b/Documentation/cdrom/ide-cd
@ -159,42 +159,7 @@ two arguments:  the CDROM device, and the slot number to which you wish
 to change.  If the slot number is -1, the drive is unloaded.


-4. Compilation options
----------------------
-
-There are a few additional options which can be set when compiling the
-driver.  Most people should not need to mess with any of these; they
-are listed here simply for completeness.  A compilation option can be
-enabled by adding a line of the form `#define <option> 1' to the top
-of ide-cd.c.  All these options are disabled by default.
-
-VERBOSE_IDE_CD_ERRORS
-  If this is set, ATAPI error codes will be translated into textual
-  descriptions.  In addition, a dump is made of the command which
-  provoked the error.  This is off by default to save the memory used
-  by the (somewhat long) table of error descriptions.  
-
-STANDARD_ATAPI
-  If this is set, the code needed to deal with certain drives which do
-  not properly implement the ATAPI spec will be disabled.  If you know
-  your drive implements ATAPI properly, you can turn this on to get a
-  slightly smaller kernel.
-
-NO_DOOR_LOCKING
-  If this is set, the driver will never attempt to lock the door of
-  the drive.
-
-CDROM_NBLOCKS_BUFFER
-  This sets the size of the buffer to be used for a CDROMREADAUDIO
-  ioctl.  The default is 8.
-
-TEST
-  This currently enables an additional ioctl which enables a user-mode
-  program to execute an arbitrary packet command.  See the source for
-  details.  This should be left off unless you know what you're doing.
-
-
-5. Common problems
+4. Common problems
 ------------------

 This section discusses some common problems encountered when trying to
@ -371,7 +336,7 @@ f. Data corruption.
    expense of low system performance.


-6. cdchange.c
+5. cdchange.c
 -------------

 /*
--- a/Documentation/cgroups/cgroup_event_listener.c
+++ b/Documentation/cgroups/cgroup_event_listener.c
@ -0,0 +1,110 @@
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
+
+int main(int argc, char **argv)
+{
+	int efd = -1;
+	int cfd = -1;
+	int event_control = -1;
+	char event_control_path[PATH_MAX];
+	char line[LINE_MAX];
+	int ret;
+
+	if (argc != 3) {
+		fputs(USAGE_STR, stderr);
+		return 1;
+	}
+
+	cfd = open(argv[1], O_RDONLY);
+	if (cfd == -1) {
+		fprintf(stderr, "Cannot open %s: %s\n", argv[1],
+				strerror(errno));
+		goto out;
+	}
+
+	ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+			dirname(argv[1]));
+	if (ret >= PATH_MAX) {
+		fputs("Path to cgroup.event_control is too long\n", stderr);
+		goto out;
+	}
+
+	event_control = open(event_control_path, O_WRONLY);
+	if (event_control == -1) {
+		fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
+				strerror(errno));
+		goto out;
+	}
+
+	efd = eventfd(0, 0);
+	if (efd == -1) {
+		perror("eventfd() failed");
+		goto out;
+	}
+
+	ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+	if (ret >= LINE_MAX) {
+		fputs("Arguments string is too long\n", stderr);
+		goto out;
+	}
+
+	ret = write(event_control, line, strlen(line) + 1);
+	if (ret == -1) {
+		perror("Cannot write to cgroup.event_control");
+		goto out;
+	}
+
+	while (1) {
+		uint64_t result;
+
+		ret = read(efd, &result, sizeof(result));
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			perror("Cannot read from eventfd");
+			break;
+		}
+		assert(ret == sizeof(result));
+
+		ret = access(event_control_path, W_OK);
+		if ((ret == -1) && (errno == ENOENT)) {
+				puts("The cgroup seems to have removed.");
+				ret = 0;
+				break;
+		}
+
+		if (ret == -1) {
+			perror("cgroup.event_control "
+					"is not accessable any more");
+			break;
+		}
+
+		printf("%s %s: crossed\n", argv[1], argv[2]);
+	}
+
+out:
+	if (efd >= 0)
+		close(efd);
+	if (event_control >= 0)
+		close(event_control);
+	if (cfd >= 0)
+		close(cfd);
+
+	return (ret != 0);
+}
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@ -22,6 +22,8 @@ CONTENTS:
 2. Usage Examples and Syntax
  2.1 Basic Usage
  2.2 Attaching processes
+  2.3 Mounting hierarchies by name
+  2.4 Notification API
 3. Kernel API
  3.1 Overview
  3.2 Synchronization
@ -434,6 +436,25 @@ you give a subsystem a name.
 The name of the subsystem appears as part of the hierarchy description
 in /proc/mounts and /proc/<pid>/cgroups.

+2.4 Notification API
+--------------------
+
+There is mechanism which allows to get notifications about changing
+status of a cgroup.
+
+To register new notification handler you need:
+ - create a file descriptor for event notification using eventfd(2);
+ - open a control file to be monitored (e.g. memory.usage_in_bytes);
+ - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
+   Interpretation of args is defined by control file implementation;
+
+eventfd will be woken up by control file implementation or when the
+cgroup is removed.
+
+To unregister notification handler just close eventfd.
+
+NOTE: Support of notifications should be implemented for the control
+file. See documentation for the subsystem.

 3. Kernel API
 =============
@ -488,6 +509,11 @@ Each subsystem should:
 - add an entry in linux/cgroup_subsys.h
 - define a cgroup_subsys object called <name>_subsys

+If a subsystem can be compiled as a module, it should also have in its
+module initcall a call to cgroup_load_subsys(), and in its exitcall a
+call to cgroup_unload_subsys(). It should also set its_subsys.module =
+THIS_MODULE in its .c file.
+
 Each subsystem may export the following methods. The only mandatory
 methods are create/destroy. Any others that are null are presumed to
 be successful no-ops.
@ -536,10 +562,21 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex. If threadgroup is
+remain valid while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future. If threadgroup is
 true, then a successful result indicates that all threads in the given
 thread's threadgroup can be moved together.

+void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+	       struct task_struct *task, bool threadgroup)
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsytem can implement a rollback. If not, not necessary.
+This will be called only about subsystems whose can_attach() operation have
+succeeded.
+
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	    struct cgroup *old_cgrp, struct task_struct *task,
 	    bool threadgroup)
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system
 containing (on top of the standard cgroup files) the following
 files describing that cpuset:

- - cpus: list of CPUs in that cpuset
- - mems: list of Memory Nodes in that cpuset
- - memory_migrate flag: if set, move pages to cpusets nodes
- - cpu_exclusive flag: is cpu placement exclusive?
- - mem_exclusive flag: is memory placement exclusive?
- - mem_hardwall flag:  is memory allocation hardwalled
- - memory_pressure: measure of how much paging pressure in cpuset
- - memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - sched_relax_domain_level: the searching range when migrating tasks
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks

 In addition, the root cpuset only has the following file:
- - memory_pressure_enabled flag: compute memory_pressure?
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?

 New cpusets are created using the mkdir system call or shell
 command.  The properties of a cpuset, such as its flags, allowed
@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendant, may share any of the same CPUs or
 Memory Nodes.

-A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
 i.e. it restricts kernel allocations for page, buffer and other data
 commonly shared by the kernel across multiple users.  All cpusets,
 whether hardwalled or not, restrict allocations of memory for user
@ -304,15 +304,15 @@ times 1000.
 ---------------------------
 There are two boolean flag files per cpuset that control where the
 kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'memory_spread_page' and
-'memory_spread_slab'.
+kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.

-If the per-cpuset boolean flag file 'memory_spread_page' is set, then
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
 the kernel will spread the file system buffers (page cache) evenly
 over all the nodes that the faulting task is allowed to use, instead
 of preferring to put those pages on the node where the task is running.

-If the per-cpuset boolean flag file 'memory_spread_slab' is set,
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
 then the kernel will spread some file system related slab caches,
 such as for inodes and dentries evenly over all the nodes that the
 faulting task is allowed to use, instead of preferring to put those
@ -337,21 +337,21 @@ their containing tasks memory spread settings.  If memory spreading
 is turned off, then the currently specified NUMA mempolicy once again
 applies to memory page allocations.

-Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
 files.  By default they contain "0", meaning that the feature is off
 for that cpuset.  If a "1" is written to that file, then that turns
 the named feature on.

 The implementation is simple.

-Setting the flag 'memory_spread_page' turns on a per-process flag
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
 PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
 joins that cpuset.  The page allocation calls for the page cache
 is modified to perform an inline check for this PF_SPREAD_PAGE task
 flag, and if set, a call to a new routine cpuset_mem_spread_node()
 returns the node to prefer for the allocation.

-Similarly, setting 'memory_spread_slab' turns on the flag
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
 PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
 pages from the node returned by cpuset_mem_spread_node().

@ -404,24 +404,24 @@ the following two situations:
    system overhead on those CPUs, including avoiding task load
    balancing if that is not needed.

-When the per-cpuset flag "sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
 be contained in a single sched domain, ensuring that load balancing
 can move a task (not otherwised pinned, as by sched_setaffinity)
 from any CPU in that cpuset to any other.

-When the per-cpuset flag "sched_load_balance" is disabled, then the
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
 scheduler will avoid load balancing across the CPUs in that cpuset,
 --except-- in so far as is necessary because some overlapping cpuset
 has "sched_load_balance" enabled.

-So, for example, if the top cpuset has the flag "sched_load_balance"
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
 enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "sched_load_balance" flag in any other
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
 cpusets won't matter, as we're already fully load balancing.

 Therefore in the above two situations, the top cpuset flag
-"sched_load_balance" should be disabled, and only some of the smaller,
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
 child cpusets have this flag enabled.

 When doing this, you don't usually want to leave any unpinned tasks in
@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that
 task to that underused CPU.

 Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "sched_load_balance" as those tasks aren't going anywhere
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
 else anyway.

 There is an impedance mismatch here, between cpusets and sched domains.
@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain.
 It is necessary for sched domains to be flat because load balancing
 across partially overlapping sets of CPUs would risk unstable dynamics
 that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'sched_load_balance', then we
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
 form a single sched domain that is a superset of both.  We won't move
 a task to a CPU outside it cpuset, but the scheduler load balancing
 code might waste some compute cycles considering that possibility.

 This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "sched_load_balance" enabled,
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
 and the sched domain configuration.  If a cpuset enables the flag, it
 will get balancing across all its CPUs, but if it disables the flag,
 it will only be assured of no load balancing if no other overlapping
 cpuset enables the flag.

-If two cpusets have partially overlapping 'cpus' allowed, and only
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
 one of them has this flag enabled, then the other may find its
 tasks only partially load balanced, just on the overlapping CPUs.
 This is just the general case of the top_cpuset example given a few
@ -468,23 +468,23 @@ load balancing to the other CPUs.
 1.7.1 sched_load_balance implementation details.
 ------------------------------------------------

-The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
 to most cpuset flags.)  When enabled for a cpuset, the kernel will
 ensure that it can load balance across all the CPUs in that cpuset
 (makes sure that all the CPUs in the cpus_allowed of that cpuset are
 in the same sched domain.)

-If two overlapping cpusets both have 'sched_load_balance' enabled,
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
 then they will be (must be) both in the same sched domain.

-If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
 then by the above that means there is a single sched domain covering
 the whole system, regardless of any other cpuset settings.

 The kernel commits to user space that it will avoid load balancing
 where it can.  It will pick as fine a granularity partition of sched
 domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.

 The internal kernel cpuset to scheduler interface passes from the
 cpuset code to the scheduler code a partition of the load balanced
@ -495,9 +495,9 @@ all the CPUs that must be load balanced.
 The cpuset code builds a new such partition and passes it to the
 scheduler sched domain setup code, to have the sched domains rebuilt
 as necessary, whenever:
- - the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
 - or CPUs come or go from a cpuset with this flag enabled,
- - or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
   and with this flag enabled changes,
 - or a cpuset with non-empty CPUs and with this flag enabled is removed,
 - or a cpu is offlined/onlined.
@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance
 on the next tick.  For some applications in special situation, waiting
 1 tick may be too long.

-The 'sched_relax_domain_level' file allows you to request changing
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
 this searching range as you like.  This file takes int value which
 indicates size of searching range in levels ideally as follows,
 otherwise initial value -1 that indicates the cpuset has no request.
@ -559,8 +559,8 @@ The system default is architecture dependent.  The system default
 can be changed using the relax_domain_level= boot parameter.

 This file is per-cpuset and affect the sched domain where the cpuset
-belongs to.  Therefore if the flag 'sched_load_balance' of a cpuset
-is disabled, then 'sched_relax_domain_level' have no effect since
+belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
 there is no sched domain belonging the cpuset.

 If multiple cpusets are overlapping and hence they form a single sched
@ -607,9 +607,9 @@ from one cpuset to another, then the kernel will adjust the tasks
 memory placement, as above, the next time that the kernel attempts
 to allocate a page of memory for that task.

-If a cpuset has its 'cpus' modified, then each task in that cpuset
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
 will have its allowed CPU placement changed immediately.  Similarly,
-if a tasks pid is written to another cpusets 'tasks' file, then its
+if a tasks pid is written to another cpusets 'cpuset.tasks' file, then its
 allowed CPU placement is changed immediately.  If such a task had been
 bound to some subset of its cpuset using the sched_setaffinity() call,
 the task will be allowed to run on any CPU allowed in its new cpuset,
@ -622,8 +622,8 @@ and the processor placement is updated immediately.
 Normally, once a page is allocated (given a physical page
 of main memory) then that page stays on whatever node it
 was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'mems' subsequently changes.
-If the cpuset flag file 'memory_migrate' is set true, then when
+cpusets memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
 tasks are attached to that cpuset, any pages that task had
 allocated to it on nodes in its previous cpuset are migrated
 to the tasks new cpuset. The relative placement of the page within
@ -631,12 +631,12 @@ the cpuset is preserved during these migration operations if possible.
 For example if the page was on the second valid node of the prior cpuset
 then the page will be placed on the second valid node of the new cpuset.

-Also if 'memory_migrate' is set true, then if that cpusets
-'mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'mems',
+Also if 'cpuset.memory_migrate' is set true, then if that cpusets
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
 will be moved to nodes in the new setting of 'mems.'
 Pages that were not in the tasks prior cpuset, or in the cpusets
-prior 'mems' setting, will not be moved.
+prior 'cpuset.mems' setting, will not be moved.

 There is an exception to the above.  If hotplug functionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset:
  cd /dev/cpuset
  mkdir Charlie
  cd Charlie
-  /bin/echo 2-3 > cpus
-  /bin/echo 1 > mems
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
  /bin/echo $$ > tasks
  sh
  # The subshell 'sh' is now running in cpuset Charlie
@ -725,10 +725,13 @@ Now you want to do something with this cpuset.

 In this directory you can find several files:
 # ls
-cpu_exclusive  memory_migrate      mems                      tasks
-cpus           memory_pressure     notify_on_release
-mem_exclusive  memory_spread_page  sched_load_balance
-mem_hardwall   memory_spread_slab  sched_relax_domain_level
+cpuset.cpu_exclusive       cpuset.memory_spread_slab
+cpuset.cpus                cpuset.mems
+cpuset.mem_exclusive       cpuset.sched_load_balance
+cpuset.mem_hardwall        cpuset.sched_relax_domain_level
+cpuset.memory_migrate      notify_on_release
+cpuset.memory_pressure     tasks
+cpuset.memory_spread_page

 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
@ -736,13 +739,13 @@ it, its properties.  By writing to these files you can manipulate
 the cpuset.

 Set some flags:
-# /bin/echo 1 > cpu_exclusive
+# /bin/echo 1 > cpuset.cpu_exclusive

 Add some cpus:
-# /bin/echo 0-7 > cpus
+# /bin/echo 0-7 > cpuset.cpus

 Add some mems:
-# /bin/echo 0-7 > mems
+# /bin/echo 0-7 > cpuset.mems

 Now attach your shell to this cpuset:
 # /bin/echo $$ > tasks
@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
 This is the syntax to use when writing in the cpus or mems files
 in cpuset directories:

-# /bin/echo 1-4 > cpus		-> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpus	-> set cpus list to cpus 1,2,3,4
+# /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+# /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4

 To add a CPU to a cpuset, write the new list of CPUs including the
 CPU to be added. To add 6 to the above cpuset:

-# /bin/echo 1-4,6 > cpus	-> set cpus list to cpus 1,2,3,4,6
+# /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6

 Similarly to remove a CPU from a cpuset, write the new list of CPUs
 without the CPU to be removed.

 To remove all the CPUs:

-# /bin/echo "" > cpus		-> clear cpus list
+# /bin/echo "" > cpuset.cpus		-> clear cpus list

 2.3 Setting flags
 -----------------

 The syntax is very simple:

-# /bin/echo 1 > cpu_exclusive 	-> set flag 'cpu_exclusive'
-# /bin/echo 0 > cpu_exclusive 	-> unset flag 'cpu_exclusive'
+# /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+# /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'

 2.4 Attaching processes
 -----------------------
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@ -1,6 +1,6 @@
 Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2009/1/20
-Base Kernel Version: based on 2.6.29-rc2.
+Last Updated: 2010/2
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).

 Because VM is getting complex (one of reasons is memcg...), memcg's behavior
 is complex. This is a document for memcg's internal behavior.
@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	race and lock dependency with other cgroup subsystems.

 	example)
-	# mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices
+	# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices

 	and do task move, mkdir, rmdir etc...under this.

@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.

 	For example, test like following is good.
 	(Shell-A)
-	# mount -t cgroup none /cgroup -t memory
+	# mount -t cgroup none /cgroup -o memory
 	# mkdir /cgroup/test
 	# echo 40M > /cgroup/test/memory.limit_in_bytes
 	# echo 0 > /cgroup/test/tasks
@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	#echo 50M > memory.limit_in_bytes
 	#echo 50M > memory.memsw.limit_in_bytes
 	run 51M of malloc
+
+ 9.9 Move charges at task migration
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)
+	#mkdir /cgroup/A
+	#echo $$ >/cgroup/A/tasks
+	run some programs which uses some amount of memory in /cgroup/A.
+
+	(Shell-B)
+	#mkdir /cgroup/B
+	#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+	#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading *.usage_in_bytes or
+	memory.stat of both A and B.
+	See 8.2 of Documentation/cgroups/memory.txt to see what value should be
+	written to move_charge_at_immigrate.
+
+ 9.10 Memory thresholds
+	Memory controler implements memory thresholds using cgroups notification
+	API. You can use Documentation/cgroups/cgroup_event_listener.c to test
+	it.
+
+	(Shell-A) Create cgroup and run event listener
+	# mkdir /cgroup/A
+	# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory
+	# echo $$ >/cgroup/A/tasks
+	# a="$(dd if=/dev/zero bs=1M count=10)"
+	# a=
+
+	You will see message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's good idea to test root cgroup as well.
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@ -182,6 +182,8 @@ list.
 NOTE: Reclaim does not work for the root cgroup, since we cannot set any
 limits on the root cgroup.

+Note2: When panic_on_oom is set to "2", the whole system will panic.
+
 2. Locking

 The memory controller uses the following hierarchy
@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages).
 4.2 Task migration

 When a task migrates from one cgroup to another, it's charge is not
-carried forward. The pages allocated from the original cgroup still
+carried forward by default. The pages allocated from the original cgroup still
 remain charged to it, the charge is dropped when the page is freed or
 reclaimed.

+Note: You can move charges of a task along with task migration. See 8.
+
 4.3 Removing a cgroup

 A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
@ -336,7 +340,7 @@ Note:
 5.3 swappiness
  Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.

-  Following cgroups' swapiness can't be changed.
+  Following cgroups' swappiness can't be changed.
  - root cgroup (uses /proc/sys/vm/swappiness).
  - a cgroup which uses hierarchy and it has child cgroup.
  - a cgroup which uses hierarchy and not the root of hierarchy.
@ -377,7 +381,8 @@ The feature can be disabled by
 NOTE1: Enabling/disabling will fail if the cgroup already has other
 cgroups created below it.

-NOTE2: This feature can be enabled/disabled per subtree.
+NOTE2: When panic_on_oom is set to "2", the whole system will panic in
+case of an oom event in any cgroup.

 7. Soft limits

@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
 NOTE2: It is recommended to set the soft limit always below the hard limit,
       otherwise the hard limit will take precedence.

-8. TODO
+8. Move charges at task migration
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+
+This feature is disabled by default. It can be enabled(and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it:
+
+# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note: Each bits of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
+      group.
+Note: If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note: It can take several seconds if you move charges in giga bytes order.
+
+And if you want disable it again:
+
+# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be move
+
+Each bits of move_charge_at_immigrate has its own meaning about what type of
+charges should be moved.
+
+  bit | what type of charges would be moved ?
+ -----+------------------------------------------------------------------------
+   0  | A charge of an anonymous page(or swap of it) used by the target task.
+      | Those pages and swaps must be used only by the target task. You must
+      | enable Swap Extension(see 2.4) to enable move of swap charges.
+
+Note: Those pages and swaps must be charged to the old cgroup.
+Note: More type of pages(e.g. file cache, shmem,) will be supported by other
+      bits in future.
+
+8.3 TODO
+
+- Add support for other types of pages(e.g. file cache, shmem, etc.).
+- Implement madvise(2) to let users decide the vma to be moved or not to be
+  moved.
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+
+Memory controler implements memory thresholds using cgroups notification
+API (see cgroups.txt). It allows to register multiple memory and memsw
+thresholds and gets notifications when it crosses.
+
+To register a threshold application need:
+ - create an eventfd using eventfd(2);
+ - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+ - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
+   cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. TODO

 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
--- a/Documentation/circular-buffers.txt
+++ b/Documentation/circular-buffers.txt
@ -0,0 +1,234 @@
+			       ================
+			       CIRCULAR BUFFERS
+			       ================
+
+By: David Howells <dhowells@redhat.com>
+    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+
+Linux provides a number of features that can be used to implement circular
+buffering.  There are two sets of such features:
+
+ (1) Convenience functions for determining information about power-of-2 sized
+     buffers.
+
+ (2) Memory barriers for when the producer and the consumer of objects in the
+     buffer don't want to share a lock.
+
+To use these facilities, as discussed below, there needs to be just one
+producer and just one consumer.  It is possible to handle multiple producers by
+serialising them, and to handle multiple consumers by serialising them.
+
+
+Contents:
+
+ (*) What is a circular buffer?
+
+ (*) Measuring power-of-2 buffers.
+
+ (*) Using memory barriers with circular buffers.
+     - The producer.
+     - The consumer.
+
+
+==========================
+WHAT IS A CIRCULAR BUFFER?
+==========================
+
+First of all, what is a circular buffer?  A circular buffer is a buffer of
+fixed, finite size into which there are two indices:
+
+ (1) A 'head' index - the point at which the producer inserts items into the
+     buffer.
+
+ (2) A 'tail' index - the point at which the consumer finds the next item in
+     the buffer.
+
+Typically when the tail pointer is equal to the head pointer, the buffer is
+empty; and the buffer is full when the head pointer is one less than the tail
+pointer.
+
+The head index is incremented when items are added, and the tail index when
+items are removed.  The tail index should never jump the head index, and both
+indices should be wrapped to 0 when they reach the end of the buffer, thus
+allowing an infinite amount of data to flow through the buffer.
+
+Typically, items will all be of the same unit size, but this isn't strictly
+required to use the techniques below.  The indices can be increased by more
+than 1 if multiple items or variable-sized items are to be included in the
+buffer, provided that neither index overtakes the other.  The implementer must
+be careful, however, as a region more than one unit in size may wrap the end of
+the buffer and be broken into two segments.
+
+
+============================
+MEASURING POWER-OF-2 BUFFERS
+============================
+
+Calculation of the occupancy or the remaining capacity of an arbitrarily sized
+circular buffer would normally be a slow operation, requiring the use of a
+modulus (divide) instruction.  However, if the buffer is of a power-of-2 size,
+then a much quicker bitwise-AND instruction can be used instead.
+
+Linux provides a set of macros for handling power-of-2 circular buffers.  These
+can be made use of by:
+
+	#include <linux/circ_buf.h>
+
+The macros are:
+
+ (*) Measure the remaining capacity of a buffer:
+
+	CIRC_SPACE(head_index, tail_index, buffer_size);
+
+     This returns the amount of space left in the buffer[1] into which items
+     can be inserted.
+
+
+ (*) Measure the maximum consecutive immediate space in a buffer:
+
+	CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the amount of consecutive space left in the buffer[1] into
+     which items can be immediately inserted without having to wrap back to the
+     beginning of the buffer.
+
+
+ (*) Measure the occupancy of a buffer:
+
+	CIRC_CNT(head_index, tail_index, buffer_size);
+
+     This returns the number of items currently occupying a buffer[2].
+
+
+ (*) Measure the non-wrapping occupancy of a buffer:
+
+	CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the number of consecutive items[2] that can be extracted from
+     the buffer without having to wrap back to the beginning of the buffer.
+
+
+Each of these macros will nominally return a value between 0 and buffer_size-1,
+however:
+
+ [1] CIRC_SPACE*() are intended to be used in the producer.  To the producer
+     they will return a lower bound as the producer controls the head index,
+     but the consumer may still be depleting the buffer on another CPU and
+     moving the tail index.
+
+     To the consumer it will show an upper bound as the producer may be busy
+     depleting the space.
+
+ [2] CIRC_CNT*() are intended to be used in the consumer.  To the consumer they
+     will return a lower bound as the consumer controls the tail index, but the
+     producer may still be filling the buffer on another CPU and moving the
+     head index.
+
+     To the producer it will show an upper bound as the consumer may be busy
+     emptying the buffer.
+
+ [3] To a third party, the order in which the writes to the indices by the
+     producer and consumer become visible cannot be guaranteed as they are
+     independent and may be made on different CPUs - so the result in such a
+     situation will merely be a guess, and may even be negative.
+
+
+===========================================
+USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
+===========================================
+
+By using memory barriers in conjunction with circular buffers, you can avoid
+the need to:
+
+ (1) use a single lock to govern access to both ends of the buffer, thus
+     allowing the buffer to be filled and emptied at the same time; and
+
+ (2) use atomic counter operations.
+
+There are two sides to this: the producer that fills the buffer, and the
+consumer that empties it.  Only one thing should be filling a buffer at any one
+time, and only one thing should be emptying a buffer at any one time, but the
+two sides can operate simultaneously.
+
+
+THE PRODUCER
+------------
+
+The producer will look something like this:
+
+	spin_lock(&producer_lock);
+
+	unsigned long head = buffer->head;
+	unsigned long tail = ACCESS_ONCE(buffer->tail);
+
+	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
+		/* insert one item into the buffer */
+		struct item *item = buffer[head];
+
+		produce_item(item);
+
+		smp_wmb(); /* commit the item before incrementing the head */
+
+		buffer->head = (head + 1) & (buffer->size - 1);
+
+		/* wake_up() will make sure that the head is committed before
+		 * waking anyone up */
+		wake_up(consumer);
+	}
+
+	spin_unlock(&producer_lock);
+
+This will instruct the CPU that the contents of the new item must be written
+before the head index makes it available to the consumer and then instructs the
+CPU that the revised head index must be written before the consumer is woken.
+
+Note that wake_up() doesn't have to be the exact mechanism used, but whatever
+is used must guarantee a (write) memory barrier between the update of the head
+index and the change of state of the consumer, if a change of state occurs.
+
+
+THE CONSUMER
+------------
+
+The consumer will look something like this:
+
+	spin_lock(&consumer_lock);
+
+	unsigned long head = ACCESS_ONCE(buffer->head);
+	unsigned long tail = buffer->tail;
+
+	if (CIRC_CNT(head, tail, buffer->size) >= 1) {
+		/* read index before reading contents at that index */
+		smp_read_barrier_depends();
+
+		/* extract one item from the buffer */
+		struct item *item = buffer[tail];
+
+		consume_item(item);
+
+		smp_mb(); /* finish reading descriptor before incrementing tail */
+
+		buffer->tail = (tail + 1) & (buffer->size - 1);
+	}
+
+	spin_unlock(&consumer_lock);
+
+This will instruct the CPU to make sure the index is up to date before reading
+the new item, and then it shall make sure the CPU has finished reading the item
+before it writes the new tail pointer, which will erase the item.
+
+
+Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
+This prevents the compiler from discarding and reloading its cached value -
+which some compilers will do across smp_read_barrier_depends().  This isn't
+strictly needed if you can be sure that the opposition index will _only_ be
+used the once.
+
+
+===============
+FURTHER READING
+===============
+
+See also Documentation/memory-barriers.txt for a description of Linux's memory
+barrier facilities.
--- a/Documentation/console/console.txt
+++ b/Documentation/console/console.txt
@ -74,7 +74,7 @@ driver takes over the consoles vacated by the driver. Binding, on the other
 hand, will bind the driver to the consoles that are currently occupied by a
 system driver.

-NOTE1: Binding and binding must be selected in Kconfig. It's under:
+NOTE1: Binding and unbinding must be selected in Kconfig. It's under:

 Device Drivers -> Character devices -> Support for binding and unbinding
 console drivers
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@ -145,8 +145,8 @@ show_sampling_rate_max: THIS INTERFACE IS DEPRECATED, DON'T USE IT.
 up_threshold: defines what the average CPU usage between the samplings
 of 'sampling_rate' needs to be for the kernel to make a decision on
 whether it should increase the frequency.  For example when it is set
-to its default value of '80' it means that between the checking
-intervals the CPU needs to be on average more than 80% in use to then
+to its default value of '95' it means that between the checking
+intervals the CPU needs to be on average more than 95% in use to then
 decide that the CPU frequency needs to be increased.  

 ignore_nice_load: this parameter takes a value of '0' or '1'. When
--- a/Documentation/cpu-freq/pcc-cpufreq.txt
+++ b/Documentation/cpu-freq/pcc-cpufreq.txt
@ -0,0 +1,207 @@
+/*
+ *  pcc-cpufreq.txt - PCC interface documentation
+ *
+ *  Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
+ *  Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
+ *      Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
+ *  INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+
+			Processor Clocking Control Driver
+			---------------------------------
+
+Contents:
+---------
+1.	Introduction
+1.1	PCC interface
+1.1.1   Get Average Frequency
+1.1.2	Set Desired Frequency
+1.2	Platforms affected
+2.	Driver and /sys details
+2.1	scaling_available_frequencies
+2.2	cpuinfo_transition_latency
+2.3	cpuinfo_cur_freq
+2.4	related_cpus
+3.	Caveats
+
+1. Introduction:
+----------------
+Processor Clocking Control (PCC) is an interface between the platform
+firmware and OSPM. It is a mechanism for coordinating processor
+performance (ie: frequency) between the platform firmware and the OS.
+
+The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
+interface.
+
+OS utilizes the PCC interface to inform platform firmware what frequency the
+OS wants for a logical processor. The platform firmware attempts to achieve
+the requested frequency. If the request for the target frequency could not be
+satisfied by platform firmware, then it usually means that power budget
+conditions are in place, and "power capping" is taking place.
+
+1.1 PCC interface:
+------------------
+The complete PCC specification is available here:
+http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf
+
+PCC relies on a shared memory region that provides a channel for communication
+between the OS and platform firmware. PCC also implements a "doorbell" that
+is used by the OS to inform the platform firmware that a command has been
+sent.
+
+The ACPI PCCH() method is used to discover the location of the PCC shared
+memory region. The shared memory region header contains the "command" and
+"status" interface. PCCH() also contains details on how to access the platform
+doorbell.
+
+The following commands are supported by the PCC interface:
+* Get Average Frequency
+* Set Desired Frequency
+
+The ACPI PCCP() method is implemented for each logical processor and is
+used to discover the offsets for the input and output buffers in the shared
+memory region.
+
+When PCC mode is enabled, the platform will not expose processor performance
+or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
+the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
+AMD) will not load.
+
+However, OSPM remains in control of policy. The governor (eg: "ondemand")
+computes the required performance for each processor based on server workload.
+The PCC driver fills in the command interface, and the input buffer and
+communicates the request to the platform firmware. The platform firmware is
+responsible for delivering the requested performance.
+
+Each PCC command is "global" in scope and can affect all the logical CPUs in
+the system. Therefore, PCC is capable of performing "group" updates. With PCC
+the OS is capable of getting/setting the frequency of all the logical CPUs in
+the system with a single call to the BIOS.
+
+1.1.1 Get Average Frequency:
+----------------------------
+This command is used by the OSPM to query the running frequency of the
+processor since the last time this command was completed. The output buffer
+indicates the average unhalted frequency of the logical processor expressed as
+a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
+also signifies if the CPU frequency is limited by a power budget condition.
+
+1.1.2 Set Desired Frequency:
+----------------------------
+This command is used by the OSPM to communicate to the platform firmware the
+desired frequency for a logical processor. The output buffer is currently
+ignored by OSPM. The next invocation of "Get Average Frequency" will inform
+OSPM if the desired frequency was achieved or not.
+
+1.2 Platforms affected:
+-----------------------
+The PCC driver will load on any system where the platform firmware:
+* supports the PCC interface, and the associated PCCH() and PCCP() methods
+* assumes responsibility for managing the hardware clocking controls in order
+to deliver the requested processor performance
+
+Currently, certain HP ProLiant platforms implement the PCC interface. On those
+platforms PCC is the "default" choice.
+
+However, it is possible to disable this interface via a BIOS setting. In
+such an instance, as is also the case on platforms where the PCC interface
+is not implemented, the PCC driver will fail to load silently.
+
+2. Driver and /sys details:
+---------------------------
+When the driver loads, it merely prints the lowest and the highest CPU
+frequencies supported by the platform firmware.
+
+The PCC driver loads with a message such as:
+pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
+MHz
+
+This means that the OPSM can request the CPU to run at any frequency in
+between the limits (1600 MHz, and 2933 MHz) specified in the message.
+
+Internally, there is no need for the driver to convert the "target" frequency
+to a corresponding P-state.
+
+The VERSION number for the driver will be of the format v.xy.ab.
+eg: 1.00.02
+   ----- --
+    |    |
+    |    -- this will increase with bug fixes/enhancements to the driver
+    |-- this is the version of the PCC specification the driver adheres to
+
+
+The following is a brief discussion on some of the fields exported via the
+/sys filesystem and how their values are affected by the PCC driver:
+
+2.1 scaling_available_frequencies:
+----------------------------------
+scaling_available_frequencies is not created in /sys. No intermediate
+frequencies need to be listed because the BIOS will try to achieve any
+frequency, within limits, requested by the governor. A frequency does not have
+to be strictly associated with a P-state.
+
+2.2 cpuinfo_transition_latency:
+-------------------------------
+The cpuinfo_transition_latency field is 0. The PCC specification does
+not include a field to expose this value currently.
+
+2.3 cpuinfo_cur_freq:
+---------------------
+A) Often cpuinfo_cur_freq will show a value different than what is declared
+in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
+This is due to "turbo boost" available on recent Intel processors. If certain
+conditions are met the BIOS can achieve a slightly higher speed than requested
+by OSPM. An example:
+
+scaling_cur_freq	: 2933000
+cpuinfo_cur_freq	: 3196000
+
+B) There is a round-off error associated with the cpuinfo_cur_freq value.
+Since the driver obtains the current frequency as a "percentage" (%) of the
+nominal frequency from the BIOS, sometimes, the values displayed by
+scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
+
+scaling_cur_freq	: 1600000
+cpuinfo_cur_freq	: 1583000
+
+In this example, the nominal frequency is 2933 MHz. The driver obtains the
+current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
+
+	54% of 2933 MHz = 1583 MHz
+
+Nominal frequency is the maximum frequency of the processor, and it usually
+corresponds to the frequency of the P0 P-state.
+
+2.4 related_cpus:
+-----------------
+The related_cpus field is identical to affected_cpus.
+
+affected_cpus	: 4
+related_cpus	: 4
+
+Currently, the PCC driver does not evaluate _PSD. The platforms that support
+PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
+to ensure that the same frequency is requested of all dependent CPUs.
+
+3. Caveats:
+-----------
+The "cpufreq_stats" module in its present form cannot be loaded and
+expected to work with the PCC driver. Since the "cpufreq_stats" module
+provides information wrt each P-state, it is not applicable to the PCC driver.
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@ -315,41 +315,26 @@ A: The following are what is required for CPU hotplug infrastructure to work

 Q: I need to ensure that a particular cpu is not removed when there is some
   work specific to this cpu is in progress.
-A: First switch the current thread context to preferred cpu
+A: There are two ways.  If your code can be run in interrupt context, use
+   smp_call_function_single(), otherwise use work_on_cpu().  Note that
+   work_on_cpu() is slow, and can fail due to out of memory:

 	int my_func_on_cpu(int cpu)
 	{
-		cpumask_t saved_mask, new_mask = CPU_MASK_NONE;
-		int curr_cpu, err = 0;
-
-		saved_mask = current->cpus_allowed;
-		cpu_set(cpu, new_mask);
-		err = set_cpus_allowed(current, new_mask);
-
-		if (err)
-			return err;
-
-		/*
-		 * If we got scheduled out just after the return from
-		 * set_cpus_allowed() before running the work, this ensures
-		 * we stay locked.
-		 */
-		curr_cpu = get_cpu();
-
-		if (curr_cpu != cpu) {
-			err = -EAGAIN;
-			goto ret;
-		} else {
-			/*
-			 * Do work : But cant sleep, since get_cpu() disables preempt
-			 */
-		}
-		ret:
-			put_cpu();
-			set_cpus_allowed(current, saved_mask);
-			return err;
-		}
-
+		int err;
+		get_online_cpus();
+		if (!cpu_online(cpu))
+			err = -EINVAL;
+		else
+#if NEEDS_BLOCKING
+			err = work_on_cpu(cpu, __my_func_on_cpu, NULL);
+#else
+			smp_call_function_single(cpu, __my_func_on_cpu, &err,
+						 true);
+#endif
+		put_online_cpus();
+		return err;
+	}

 Q: How do we determine how many CPUs are available for hotplug.
 A: There is no clear spec defined way from ACPI that can give us that
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@ -122,3 +122,47 @@ volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
 brw-------  1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
 brw-------  1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
 brw-------  1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
+
+
+How to determine when a merging is complete
+===========================================
+The snapshot-merge and snapshot status lines end with:
+  <sectors_allocated>/<total_sectors> <metadata_sectors>
+
+Both <sectors_allocated> and <total_sectors> include both data and metadata.
+During merging, the number of sectors allocated gets smaller and
+smaller.  Merging has finished when the number of sectors holding data
+is zero, in other words <sectors_allocated> == <metadata_sectors>.
+
+Here is a practical example (using a hybrid of lvm and dmsetup commands):
+
+# lvs
+  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+  base    volumeGroup owi-a- 4.00g
+  snap    volumeGroup swi-a- 1.00g base  18.97
+
+# dmsetup status volumeGroup-snap
+0 8388608 snapshot 397896/2097152 1560
+                                  ^^^^ metadata sectors
+
+# lvconvert --merge -b volumeGroup/snap
+  Merging of volume snap started.
+
+# lvs volumeGroup/snap
+  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+  base    volumeGroup Owi-a- 4.00g          17.23
+
+# dmsetup status volumeGroup-base
+0 8388608 snapshot-merge 281688/2097152 1104
+
+# dmsetup status volumeGroup-base
+0 8388608 snapshot-merge 180480/2097152 712
+
+# dmsetup status volumeGroup-base
+0 8388608 snapshot-merge 16/2097152 16
+
+Merging has finished.
+
+# lvs
+  LV      VG          Attr   LSize Origin  Snap%  Move Log Copy%  Convert
+  base    volumeGroup owi-a- 4.00g
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@ -69,7 +69,6 @@ av_permissions.h
 bbootsect
 bin2c
 binkernel.spec
-binoffset
 bootsect
 bounds.h
 bsetup
@ -103,6 +102,7 @@ gconf
 gen-devlist
 gen_crc32table
 gen_init_cpio
+generated
 genheaders
 genksyms
 *_gray256.c
--- a/Documentation/driver-model/driver.txt
+++ b/Documentation/driver-model/driver.txt
@ -226,5 +226,5 @@ struct driver_attribute driver_attr_debug;
 This can then be used to add and remove the attribute from the
 driver's directory using:

-int driver_create_file(struct device_driver *, struct driver_attribute *);
-void driver_remove_file(struct device_driver *, struct driver_attribute *);
+int driver_create_file(struct device_driver *, const struct driver_attribute *);
+void driver_remove_file(struct device_driver *, const struct driver_attribute *);
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@ -192,7 +192,7 @@ command line. This will execute all matching early_param() callbacks.
 User specified early platform devices will be registered at this point.
 For the early serial console case the user can specify port on the
 kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platfrom driver and
+the class string, "serial" is the name of the platform driver and
 0 is the platform device id. If the id is -1 then the dot and the
 id can be omitted.

--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@ -26,7 +26,7 @@ use IO::Handle;
 		"dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
 		"or51211", "or51132_qam", "or51132_vsb", "bluebird",
 		"opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718",
-		"af9015");
+		"af9015", "ngene");

 # Check args
 syntax() if (scalar(@ARGV) != 1);
@ -39,7 +39,7 @@ for ($i=0; $i < scalar(@components); $i++) {
 	die $@ if $@;
 	print STDERR <<EOF;
 Firmware(s) $outfile extracted successfully.
-Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware
+Now copy it(them) to either /usr/lib/hotplug/firmware or /lib/firmware
 (depending on configuration of firmware hotplug).
 EOF
 	exit(0);
@ -549,6 +549,24 @@ sub af9015 {
 	close INFILE;
 }

+sub ngene {
+    my $url = "http://www.digitaldevices.de/download/";
+    my $file1 = "ngene_15.fw";
+    my $hash1 = "d798d5a757121174f0dbc5f2833c0c85";
+    my $file2 = "ngene_17.fw";
+    my $hash2 = "26b687136e127b8ac24b81e0eeafc20b";
+
+    checkstandard();
+
+    wgetfile($file1, $url . $file1);
+    verify($file1, $hash1);
+
+    wgetfile($file2, $url . $file2);
+    verify($file2, $hash2);
+
+    "$file1, $file2";
+}
+
 # ---------------------------------------------------------------
 # Utilities

@ -667,6 +685,7 @@ sub delzero{
 sub syntax() {
    print STDERR "syntax: get_dvb_firmware <component>\n";
    print STDERR "Supported components:\n";
+    @components = sort @components;
    for($i=0; $i < scalar(@components); $i++) {
 	print STDERR "\t" . $components[$i] . "\n";
    }
--- a/Documentation/eisa.txt
+++ b/Documentation/eisa.txt
@ -171,7 +171,7 @@ device.
 virtual_root.force_probe :

 Force the probing code to probe EISA slots even when it cannot find an
-EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0
+EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
 (don't force), and set to 1 (force probing) when either
 CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.

--- a/Documentation/email-clients.txt
+++ b/Documentation/email-clients.txt
@ -216,26 +216,14 @@ Works.  Use "Insert file..." or external editor.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Gmail (Web GUI)

-If you just have to use Gmail to send patches, it CAN be made to work.  It
-requires a bit of external help, though.
+Does not work for sending patches.

-The first problem is that Gmail converts tabs to spaces.  This will
-totally break your patches.  To prevent this, you have to use a different
-editor.  There is a firefox extension called "ViewSourceWith"
-(https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to
-edit any text box in the editor of your choice.  Configure it to launch
-your favorite editor.  When you want to send a patch, use this technique.
-Once you have crafted your messsage + patch, save and exit the editor,
-which should reload the Gmail edit box.  GMAIL WILL PRESERVE THE TABS.
-Hoorah.  Apparently you can cut-n-paste literal tabs, but Gmail will
-convert those to spaces upon sending!
+Gmail web client converts tabs to spaces automatically.

-The second problem is that Gmail converts tabs to spaces on replies.  If
-you reply to a patch, don't expect to be able to apply it as a patch.
+At the same time it wraps lines every 78 chars with CRLF style line breaks
+although tab2space problem can be solved with external editor.

-The last problem is that Gmail will base64-encode any message that has a
-non-ASCII character.  That includes things like European names.  Be aware.
-
-Gmail is not convenient for lkml patches, but CAN be made to work.
+Another problem is that Gmail will base64-encode any message that has a
+non-ASCII character. That includes things like European names.

                                ###
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@ -143,8 +143,8 @@ o provide a way to configure fault attributes
  failslab, fail_page_alloc, and fail_make_request use this way.
  Helper functions:

-	init_fault_attr_entries(entries, attr, name);
-	void cleanup_fault_attr_entries(entries);
+	init_fault_attr_dentries(entries, attr, name);
+	void cleanup_fault_attr_dentries(entries);

 - module parameters

--- a/Documentation/fault-injection/provoke-crashes.txt
+++ b/Documentation/fault-injection/provoke-crashes.txt
@ -0,0 +1,38 @@
+The lkdtm module provides an interface to crash or injure the kernel at
+predefined crashpoints to evaluate the reliability of crash dumps obtained
+using different dumping solutions. The module uses KPROBEs to instrument
+crashing points, but can also crash the kernel directly without KRPOBE
+support.
+
+
+You can provide the way either through module arguments when inserting
+the module, or through a debugfs interface.
+
+Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
+				[cpoint_count={>0}]
+
+  recur_count : Recursion level for the stack overflow test. Default is 10.
+
+  cpoint_name : Crash point where the kernel is to be crashed. It can be
+	 one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
+	 FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
+	 IDE_CORE_CP, DIRECT
+
+  cpoint_type : Indicates the action to be taken on hitting the crash point.
+     It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
+     CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
+     WRITE_AFTER_FREE,
+
+  cpoint_count : Indicates the number of times the crash point is to be hit
+    to trigger an action. The default is 10.
+
+You can also induce failures by mounting debugfs and writing the type to
+<mountpoint>/provoke-crash/<crashpoint>. E.g.,
+
+  mount -t debugfs debugfs /mnt
+  echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
+
+
+A special file is `DIRECT' which will induce the crash directly without
+KPROBE instrumentation. This mode is the only one available when the module
+is built on a kernel without KPROBEs support.
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@ -6,21 +6,6 @@ be removed from this file.

 ---------------------------

-What:	USER_SCHED
-When:	2.6.34
-
-Why:	USER_SCHED was implemented as a proof of concept for group scheduling.
-	The effect of USER_SCHED can already be achieved from userspace with
-	the help of libcgroup. The removal of USER_SCHED will also simplify
-	the scheduler code with the removal of one major ifdef. There are also
-	issues USER_SCHED has with USER_NS. A decision was taken not to fix
-	those and instead remove USER_SCHED. Also new group scheduling
-	features will not be implemented for USER_SCHED.
-
-Who:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
-
---------------------------
-
 What:	PRISM54
 When:	2.6.34

@ -64,6 +49,17 @@ Who:	Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>

 ---------------------------

+What:	Deprecated snapshot ioctls
+When:	2.6.36
+
+Why:	The ioctls in kernel/power/user.c were marked as deprecated long time
+	ago. Now they notify users about that so that they need to replace
+	their userspace. After some more time, remove them completely.
+
+Who:	Jiri Slaby <jirislaby@gmail.com>
+
+---------------------------
+
 What:	The ieee80211_regdom module parameter
 When:	March 2010 / desktop catchup

@ -88,27 +84,6 @@ Who:	Luis R. Rodriguez <lrodriguez@atheros.com>

 ---------------------------

-What:	CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information
-When:	March 2010 / desktop catchup
-
-Why:	The old regulatory infrastructure has been replaced with a new one
-	which does not require statically defined regulatory domains. We do
-	not want to keep static regulatory domains in the kernel due to the
-	the dynamic nature of regulatory law and localization. We kept around
-	the old static definitions for the regulatory domains of:
-
-		* US
-		* JP
-		* EU
-
-	and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
-	set. We will remove this option once the standard Linux desktop catches
-	up with the new userspace APIs we have implemented.
-
-Who:	Luis R. Rodriguez <lrodriguez@atheros.com>
-
---------------------------
-
 What:	dev->power.power_state
 When:	July 2007
 Why:	Broken design for runtime control over driver power states, confusing
@ -142,19 +117,25 @@ Who:	Mauro Carvalho Chehab <mchehab@infradead.org>
 ---------------------------

 What:	PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl])
-When:	November 2005
+When:	2.6.35/2.6.36
 Files:	drivers/pcmcia/: pcmcia_ioctl.c
 Why:	With the 16-bit PCMCIA subsystem now behaving (almost) like a
 	normal hotpluggable bus, and with it using the default kernel
 	infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA
 	control ioctl needed by cardmgr and cardctl from pcmcia-cs is
-	unnecessary, and makes further cleanups and integration of the
+	unnecessary and potentially harmful (it does not provide for
+	proper locking), and makes further cleanups and integration of the
 	PCMCIA subsystem into the Linux kernel device driver model more
 	difficult. The features provided by cardmgr and cardctl are either
 	handled by the kernel itself now or are available in the new
 	pcmciautils package available at
 	http://kernel.org/pub/linux/utils/kernel/pcmcia/
-Who:	Dominik Brodowski <linux@brodo.de>
+
+	For all architectures except ARM, the associated config symbol
+	has been removed from kernel 2.6.34; for ARM, it will be likely
+	be removed from kernel 2.6.35. The actual code will then likely
+	be removed from kernel 2.6.36.
+Who:	Dominik Brodowski <linux@dominikbrodowski.net>

 ---------------------------

@ -468,9 +449,143 @@ Who:	Alok N Kataria <akataria@vmware.com>

 ----------------------------

-What:	adt7473 hardware monitoring driver
-When:	February 2010
-Why:	Obsoleted by the adt7475 driver.
-Who:	Jean Delvare <khali@linux-fr.org>
+What:	Support for lcd_switch and display_get in asus-laptop driver
+When:	March 2010
+Why:	These two features use non-standard interfaces. There are the
+	only features that really need multiple path to guess what's
+	the right method name on a specific laptop.

---------------------------
+	Removing them will allow to remove a lot of code an significantly
+	clean the drivers.
+
+	This will affect the backlight code which won't be able to know
+	if the backlight is on or off. The platform display file will also be
+	write only (like the one in eeepc-laptop).
+
+	This should'nt affect a lot of user because they usually know
+	when their display is on or off.
+
+Who:	Corentin Chary <corentin.chary@gmail.com>
+
+----------------------------
+
+What:	usbvideo quickcam_messenger driver
+When:	2.6.35
+Files:	drivers/media/video/usbvideo/quickcam_messenger.[ch]
+Why:	obsolete v4l1 driver replaced by gspca_stv06xx
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	ov511 v4l1 driver
+When:	2.6.35
+Files:	drivers/media/video/ov511.[ch]
+Why:	obsolete v4l1 driver replaced by gspca_ov519
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	w9968cf v4l1 driver
+When:	2.6.35
+Files:	drivers/media/video/w9968cf*.[ch]
+Why:	obsolete v4l1 driver replaced by gspca_ov519
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	ovcamchip sensor framework
+When:	2.6.35
+Files:	drivers/media/video/ovcamchip/*
+Why:	Only used by obsoleted v4l1 drivers
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	stv680 v4l1 driver
+When:	2.6.35
+Files:	drivers/media/video/stv680.[ch]
+Why:	obsolete v4l1 driver replaced by gspca_stv0680
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	zc0301 v4l driver
+When:	2.6.35
+Files:	drivers/media/video/zc0301/*
+Why:	Duplicate functionality with the gspca_zc3xx driver, zc0301 only
+	supports 2 USB-ID's (because it only supports a limited set of
+	sensors) wich are also supported by the gspca_zc3xx driver
+	(which supports 53 USB-ID's in total)
+Who:	Hans de Goede <hdegoede@redhat.com>
+
+----------------------------
+
+What:	corgikbd, spitzkbd, tosakbd driver
+When:	2.6.35
+Files:	drivers/input/keyboard/{corgi,spitz,tosa}kbd.c
+Why:	We now have a generic GPIO based matrix keyboard driver that
+	are fully capable of handling all the keys on these devices.
+	The original drivers manipulate the GPIO registers directly
+	and so are difficult to maintain.
+Who:	Eric Miao <eric.y.miao@gmail.com>
+
+----------------------------
+
+What:	corgi_ssp and corgi_ts driver
+When:	2.6.35
+Files:	arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c
+Why:	The corgi touchscreen is now deprecated in favour of the generic
+	ads7846.c driver. The noise reduction technique used in corgi_ts.c,
+	that's to wait till vsync before ADC sampling, is also integrated into
+	ads7846 driver now. Provided that the original driver is not generic
+	and is difficult to maintain, it will be removed later.
+Who:	Eric Miao <eric.y.miao@gmail.com>
+
+----------------------------
+
+What:	capifs
+When:	February 2011
+Files:	drivers/isdn/capi/capifs.*
+Why:	udev fully replaces this special file system that only contains CAPI
+	NCCI TTY device nodes. User space (pppdcapiplugin) works without
+	noticing the difference.
+Who:	Jan Kiszka <jan.kiszka@web.de>
+
+----------------------------
+
+What:	KVM memory aliases support
+When:	July 2010
+Why:	Memory aliasing support is used for speeding up guest vga access
+	through the vga windows.
+
+	Modern userspace no longer uses this feature, so it's just bitrotted
+	code and can be removed with no impact.
+Who:	Avi Kivity <avi@redhat.com>
+
+----------------------------
+
+What:	KVM kernel-allocated memory slots
+When:	July 2010
+Why:	Since 2.6.25, kvm supports user-allocated memory slots, which are
+	much more flexible than kernel-allocated slots.  All current userspace
+	supports the newer interface and this code can be removed with no
+	impact.
+Who:	Avi Kivity <avi@redhat.com>
+
+----------------------------
+
+What:	KVM paravirt mmu host support
+When:	January 2011
+Why:	The paravirt mmu host support is slower than non-paravirt mmu, both
+	on newer and older hardware.  It is already not exposed to the guest,
+	and kept only for live migration purposes.
+Who:	Avi Kivity <avi@redhat.com>
+
+----------------------------
+
+What: 	"acpi=ht" boot option
+When:	2.6.35
+Why:	Useful in 2003, implementation is a hack.
+	Generally invoked by accident today.
+	Seen as doing more harm than good.
+Who:	Len Brown <len.brown@intel.com>
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@ -1,7 +1,5 @@
 00-INDEX
 	- this file (info on some of the filesystems supported by linux).
-Exporting
-	- explanation of how to make filesystems exportable.
 Locking
 	- info on locking rules as they pertain to Linux VFS.
 9p.txt
@ -18,6 +16,8 @@ befs.txt
 	- information about the BeOS filesystem for Linux.
 bfs.txt
 	- info for the SCO UnixWare Boot Filesystem (BFS).
+ceph.txt
+	- info for the Ceph Distributed File System
 cifs.txt
 	- description of the CIFS filesystem.
 coda.txt
@ -34,6 +34,8 @@ dlmfs.txt
 	- info on the userspace interface to the OCFS2 DLM.
 dnotify.txt
 	- info about directory notification in Linux.
+dnotify_test.c
+	- example program for dnotify
 ecryptfs.txt
 	- docs on eCryptfs: stacked cryptographic filesystem for Linux.
 exofs.txt
@ -64,16 +66,14 @@ jfs.txt
 	- info and mount options for the JFS filesystem.
 locks.txt
 	- info on file locking implementations, flock() vs. fcntl(), etc.
+logfs.txt
+	- info on the LogFS flash filesystem.
 mandatory-locking.txt
 	- info on the Linux implementation of Sys V mandatory file locking.
 ncpfs.txt
 	- info on Novell Netware(tm) filesystem using NCP protocol.
-nfs41-server.txt
-	- info on the Linux server implementation of NFSv4 minor version 1.
-nfs-rdma.txt
-	- how to install and setup the Linux NFS/RDMA client and server software.
-nfsroot.txt
-	- short guide on setting up a diskless box with NFS root filesystem.
+nfs/
+	- nfs-related documentation.
 nilfs2.txt
 	- info and mount options for the NILFS2 filesystem.
 ntfs.txt
@ -92,8 +92,6 @@ relay.txt
 	- info on relay, for efficient streaming from kernel to user space.
 romfs.txt
 	- description of the ROMFS filesystem.
-rpc-cache.txt
-	- introduction to the caching mechanisms in the sunrpc layer.
 seq_file.txt
 	- how to use the seq_file API
 sharedsubtree.txt
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@ -460,13 +460,6 @@ in sys_read() and friends.

 --------------------------- dquot_operations -------------------------------
 prototypes:
-	int (*initialize) (struct inode *, int);
-	int (*drop) (struct inode *);
-	int (*alloc_space) (struct inode *, qsize_t, int);
-	int (*alloc_inode) (const struct inode *, unsigned long);
-	int (*free_space) (struct inode *, qsize_t);
-	int (*free_inode) (const struct inode *, unsigned long);
-	int (*transfer) (struct inode *, struct iattr *);
 	int (*write_dquot) (struct dquot *);
 	int (*acquire_dquot) (struct dquot *);
 	int (*release_dquot) (struct dquot *);
@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
 What filesystem should expect from the generic quota functions:

 		FS recursion	Held locks when called
-initialize:	yes		maybe dqonoff_sem
-drop:		yes		-
-alloc_space:	->mark_dirty()	-
-alloc_inode:	->mark_dirty()	-
-free_space:	->mark_dirty()	-
-free_inode:	->mark_dirty()	-
-transfer:	yes		-
 write_dquot:	yes		dqonoff_sem or dqptr_sem
 acquire_dquot:	yes		dqonoff_sem or dqptr_sem
 release_dquot:	yes		dqonoff_sem or dqptr_sem
@ -495,10 +481,6 @@ write_info:	yes		dqonoff_sem
 FS recursion means calling ->quota_read() and ->quota_write() from superblock
 operations.

->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
 More details about quota locking can be found in fs/dquot.c.

 --------------------------- vm_operations_struct -----------------------------
--- a/Documentation/filesystems/Makefile
+++ b/Documentation/filesystems/Makefile
@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := dnotify_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@ -0,0 +1,140 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability.  No single point of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre.  Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons.  Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.).  File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs.  When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures.  The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads.  In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation.  The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system.  Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes.  That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes.  This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects.  (However, if the monitor you specify
+happens to be down, the mount won't succeed.)  The port can be left
+off if the monitor is using the default.  So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient.  If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+  ip=A.B.C.D[:N]
+	Specify the IP and/or port the client should bind to locally.
+	There is normally not much reason to do this.  If the IP is not
+	specified, the client's IP address is determined by looking at the
+	address it's connection to the monitor originates from.
+
+  wsize=X
+	Specify the maximum write size in bytes.  By default there is no
+	maximum.  Ceph will normally size writes based on the file stripe
+	size.
+
+  rsize=X
+	Specify the maximum readahead.
+
+  mount_timeout=X
+	Specify the timeout value for mount (in seconds), in the case
+	of a non-responsive Ceph file system.  The default is 30
+	seconds.
+
+  rbytes
+	When stat() is called on a directory, set st_size to 'rbytes',
+	the summation of file sizes over all files nested beneath that
+	directory.  This is the default.
+
+  norbytes
+	When stat() is called on a directory, set st_size to the
+	number of entries in that directory.
+
+  nocrc
+	Disable CRC32C calculation for data writes.  If set, the storage node
+	must rely on TCP's error correction to detect data corruption
+	in the data payload.
+
+  noasyncreaddir
+	Disable client's use its local cache to satisfy	readdir
+	requests.  (This does not change correctness; the client uses
+	cached metadata only when a lease or capability ensures it is
+	valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+	http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+	git://ceph.newdream.net/git/ceph-client.git
+	git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+
+and the source for the full system is at
+	git://ceph.newdream.net/git/ceph.git
--- a/Documentation/filesystems/dentry-locking.txt
+++ b/Documentation/filesystems/dentry-locking.txt
@ -62,7 +62,8 @@ changes are :
 2. Insertion of a dentry into the hash table is done using
   hlist_add_head_rcu() which take care of ordering the writes - the
   writes to the dentry must be visible before the dentry is
-   inserted. This works in conjunction with hlist_for_each_rcu() while
+   inserted. This works in conjunction with hlist_for_each_rcu(),
+   which has since been replaced by hlist_for_each_entry_rcu(), while
   walking the hash chain. The only requirement is that all
   initialization to the dentry must be done before
   hlist_add_head_rcu() since we don't have dcache_lock protection
--- a/Documentation/filesystems/dnotify.txt
+++ b/Documentation/filesystems/dnotify.txt
@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.

 Example
 -------
+See Documentation/filesystems/dnotify_test.c for an example.

-	#define _GNU_SOURCE	/* needed to get the defines */
-	#include <fcntl.h>	/* in glibc 2.2 this has the needed
-					   values defined */
-	#include <signal.h>
-	#include <stdio.h>
-	#include <unistd.h>
-
-	static volatile int event_fd;
-
-	static void handler(int sig, siginfo_t *si, void *data)
-	{
-		event_fd = si->si_fd;
-	}
-
-	int main(void)
-	{
-		struct sigaction act;
-		int fd;
-
-		act.sa_sigaction = handler;
-		sigemptyset(&act.sa_mask);
-		act.sa_flags = SA_SIGINFO;
-		sigaction(SIGRTMIN + 1, &act, NULL);
-
-		fd = open(".", O_RDONLY);
-		fcntl(fd, F_SETSIG, SIGRTMIN + 1);
-		fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
-		/* we will now be notified if any of the files
-		   in "." is modified or new files are created */
-		while (1) {
-			pause();
-			printf("Got event on fd=%d\n", event_fd);
-		}
-	}
+NOTE
+----
+Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
+See Documentation/filesystems/inotify.txt for more information on it.
--- a/Documentation/filesystems/dnotify_test.c
+++ b/Documentation/filesystems/dnotify_test.c
@ -0,0 +1,34 @@
+#define _GNU_SOURCE	/* needed to get the defines */
+#include <fcntl.h>	/* in glibc 2.2 this has the needed
+				   values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static volatile int event_fd;
+
+static void handler(int sig, siginfo_t *si, void *data)
+{
+	event_fd = si->si_fd;
+}
+
+int main(void)
+{
+	struct sigaction act;
+	int fd;
+
+	act.sa_sigaction = handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	sigaction(SIGRTMIN + 1, &act, NULL);
+
+	fd = open(".", O_RDONLY);
+	fcntl(fd, F_SETSIG, SIGRTMIN + 1);
+	fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
+	/* we will now be notified if any of the files
+	   in "." is modified or new files are created */
+	while (1) {
+		pause();
+		printf("Got event on fd=%d\n", event_fd);
+	}
+}
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@ -196,7 +196,7 @@ nobarrier		This also requires an IO stack which can support
 			also be used to enable or disable barriers, for
 			consistency with other ext4 mount options.

-inode_readahead=n	This tuning parameter controls the maximum
+inode_readahead_blks=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
 			table readahead algorithm will pre-read into
 			the buffer cache.  The default value is 32 blocks.
--- a/Documentation/filesystems/logfs.txt
+++ b/Documentation/filesystems/logfs.txt
@ -0,0 +1,241 @@
+
+The LogFS Flash Filesystem
+==========================
+
+Specification
+=============
+
+Superblocks
+-----------
+
+Two superblocks exist at the beginning and end of the filesystem.
+Each superblock is 256 Bytes large, with another 3840 Bytes reserved
+for future purposes, making a total of 4096 Bytes.
+
+Superblock locations may differ for MTD and block devices.  On MTD the
+first non-bad block contains a superblock in the first 4096 Bytes and
+the last non-bad block contains a superblock in the last 4096 Bytes.
+On block devices, the first 4096 Bytes of the device contain the first
+superblock and the last aligned 4096 Byte-block contains the second
+superblock.
+
+For the most part, the superblocks can be considered read-only.  They
+are written only to correct errors detected within the superblocks,
+move the journal and change the filesystem parameters through tunefs.
+As a result, the superblock does not contain any fields that require
+constant updates, like the amount of free space, etc.
+
+Segments
+--------
+
+The space in the device is split up into equal-sized segments.
+Segments are the primary write unit of LogFS.  Within each segments,
+writes happen from front (low addresses) to back (high addresses.  If
+only a partial segment has been written, the segment number, the
+current position within and optionally a write buffer are stored in
+the journal.
+
+Segments are erased as a whole.  Therefore Garbage Collection may be
+required to completely free a segment before doing so.
+
+Journal
+--------
+
+The journal contains all global information about the filesystem that
+is subject to frequent change.  At mount time, it has to be scanned
+for the most recent commit entry, which contains a list of pointers to
+all currently valid entries.
+
+Object Store
+------------
+
+All space except for the superblocks and journal is part of the object
+store.  Each segment contains a segment header and a number of
+objects, each consisting of the object header and the payload.
+Objects are either inodes, directory entries (dentries), file data
+blocks or indirect blocks.
+
+Levels
+------
+
+Garbage collection (GC) may fail if all data is written
+indiscriminately.  One requirement of GC is that data is seperated
+roughly according to the distance between the tree root and the data.
+Effectively that means all file data is on level 0, indirect blocks
+are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
+respectively.  Inode file data is on level 6 for the inodes and 7-11
+for indirect blocks.
+
+Each segment contains objects of a single level only.  As a result,
+each level requires its own seperate segment to be open for writing.
+
+Inode File
+----------
+
+All inodes are stored in a special file, the inode file.  Single
+exception is the inode file's inode (master inode) which for obvious
+reasons is stored in the journal instead.  Instead of data blocks, the
+leaf nodes of the inode files are inodes.
+
+Aliases
+-------
+
+Writes in LogFS are done by means of a wandering tree.  A naïve
+implementation would require that for each write or a block, all
+parent blocks are written as well, since the block pointers have
+changed.  Such an implementation would not be very efficient.
+
+In LogFS, the block pointer changes are cached in the journal by means
+of alias entries.  Each alias consists of its logical address - inode
+number, block index, level and child number (index into block) - and
+the changed data.  Any 8-byte word can be changes in this manner.
+
+Currently aliases are used for block pointers, file size, file used
+bytes and the height of an inodes indirect tree.
+
+Segment Aliases
+---------------
+
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tupel.  GC will later empty this
+segment and the alias can be removed again.  This is used on MTD only.
+
+Vim
+---
+
+By cleverly predicting the life time of data, it is possible to
+seperate long-living data from short-living data and thereby reduce
+the GC overhead later.  Each type of distinc life expectency (vim) can
+have a seperate segment open for writing.  Each (level, vim) tupel can
+be open just once.  If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+
+Indirect Tree
+-------------
+
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers.  One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+
+Another difference is the addressing of indirect blocks.  In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode.  In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+
+Compression
+-----------
+
+Both file data and metadata can be compressed.  Compression for file
+data can be enabled with chattr +c and disabled with chattr -c.  Doing
+so has no effect on existing data, but new data will be stored
+accordingly.  New inodes will inherit the compression flag of the
+parent directory.
+
+Metadata is always compressed.  However, the space accounting ignores
+this and charges for the uncompressed size.  Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously.  Even on a 100% full medium, GC may
+not consume any extra space, so the compression gains are lost space
+to the user.
+
+However, they are not lost space to the filesystem internals.  By
+cheating the user for those bytes, the filesystem gained some slack
+space and GC will run less often and faster.
+
+Garbage Collection and Wear Leveling
+------------------------------------
+
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold.  The best (known) candidate is picked based
+on the least amount of valid data contained in the segment.  All
+remaining valid data is copied elsewhere, thereby invalidating it.
+
+The GC code also checks for aliases and writes then back if their
+number gets too large.
+
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection.  If a stale segments erase count is significantly
+lower than the active segments' erase counts, it will be picked.  Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+
+Values for "occasionally", "significantly lower" are compile time
+constants.
+
+Hashed directories
+------------------
+
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash.  In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used.  For each table, the hash value
+modulo the table size gives the table index.
+
+Tables sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively.  So the first
+table contains 16 entries, the second 512-16, etc.
+
+The last table is special in several ways.  First its size depends on
+the effective 32bit limit on telldir/seekdir cookies.  Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31.  Secondly the table contains hash buckets with 16
+entries each.
+
+Using single-entry buckets would result in birthday "attacks".  At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x
+collisions necessary to overflow a bucket, but testing showed that in
+10,000 runs the lowest directory fill before a bucket overflow was
+188,057,130 entries with an average of 315,149,915 entries.  So for
+directory sizes of up to a million, bucket overflows should be
+virtually impossible under normal circumstances.
+
+With carefully chosen filenames, it is obviously possible to cause an
+overflow with just 21 entries (4 higher tables + 16 entries + 1).  So
+there may be a security concern if a malicious user has write access
+to a directory.
+
+Open For Discussion
+===================
+
+Device Address Space
+--------------------
+
+A device address space is used for caching.  Both block devices and
+MTD provide functions to either read a single page or write a segment.
+Partial segments may be written for data integrity, but where possible
+complete segments are written for performance on simple block device
+flash media.
+
+Meta Inodes
+-----------
+
+Inodes are stored in the inode file, which is just a regular file for
+most purposes.  At umount time, however, the inode file needs to
+remain open until all dirty inodes are written.  So
+generic_shutdown_super() may not close this inode, but shouldn't
+complain about remaining inodes due to the inode file either.  Same
+goes for mapping inode of the device address space.
+
+Currently logfs uses a hack that essentially copies part of fs/inode.c
+code over.  A general solution would be preferred.
+
+Indirect block mapping
+----------------------
+
+With compression, the block device (or mapping inode) cannot be used
+to cache indirect blocks.  Some other place is required.  Currently
+logfs uses the top half of each inode's address space.  The low 8TB
+(on 32bit) are filled with file data, the high 8TB are used for
+indirect blocks.
+
+One problem is that 16TB files created on 64bit systems actually have
+data in the top 8TB.  But files >16TB would cause problems anyway, so
+only the limit has changed.
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@ -0,0 +1,16 @@
+00-INDEX
+	- this file (nfs-related documentation).
+Exporting
+	- explanation of how to make filesystems exportable.
+knfsd-stats.txt
+	- statistics which the NFS server makes available to user space.
+nfs.txt
+	- nfs client, and DNS resolution for fs_locations.
+nfs41-server.txt
+	- info on the Linux server implementation of NFSv4 minor version 1.
+nfs-rdma.txt
+	- how to install and setup the Linux NFS/RDMA client and server software
+nfsroot.txt
+	- short guide on setting up a diskless box with NFS root filesystem.
+rpc-cache.txt
+	- introduction to the caching mechanisms in the sunrpc layer.
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
--- a/Documentation/filesystems/nfs/knfsd-stats.txt
+++ b/Documentation/filesystems/nfs/knfsd-stats.txt
--- a/Documentation/filesystems/nfs/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
--- a/Documentation/filesystems/nfs/nfs.txt
+++ b/Documentation/filesystems/nfs/nfs.txt
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
 on or off; rpc.nfsd does this correctly.)

 The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
-on the latest NFSv4.1 Internet Draft:
-http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
+on RFC 5661.

 From the many new features in NFSv4.1 the current implementation
 focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@ -41,10 +40,10 @@ interoperability problems with future clients.  Known issues:
 	  conformant with the spec (for example, we don't use kerberos
 	  on the backchannel correctly).
 	- no trunking support: no clients currently take advantage of
-	  trunking, but this is a mandatory failure, and its use is
+	  trunking, but this is a mandatory feature, and its use is
 	  recommended to clients in a number of places.  (E.g. to ensure
 	  timely renewal in case an existing connection's retry timeouts
-	  have gotten too long; see section 8.3 of the draft.)
+	  have gotten too long; see section 8.3 of the RFC.)
 	  Therefore, lack of this feature may cause future clients to
 	  fail.
 	- Incomplete backchannel support: incomplete backchannel gss
@ -213,3 +212,10 @@ The following cases aren't supported yet:
  DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
 * DESTROY_SESSION MUST be the final operation in the COMPOUND request.

+Nonstandard compound limitations:
+* No support for a sessions fore channel RPC compound that requires both a
+  ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
+  fail to live up to the promise we made in CREATE_SESSION fore channel
+  negotiation.
+* No more than one IO operation (read, write, readdir) allowed per
+  compound.
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
--- a/Documentation/filesystems/nfs/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@ -28,7 +28,7 @@ described in the man pages included in the package.
 Project web page:    http://www.nilfs.org/en/
 Download page:       http://www.nilfs.org/en/download.html
 Git tree web page:   http://www.nilfs.org/git/
-NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
+List info:           http://vger.kernel.org/vger-lists.html#linux-nilfs

 Caveats
 =======
@ -74,6 +74,9 @@ norecovery		Disable recovery of the filesystem on mount.
 			This disables every write access on the device for
 			read-only mounts or snapshots.  This option will fail
 			for r/w mounts on an unclean volume.
+discard			Issue discard/TRIM commands to the underlying block
+			device when blocks are freed.  This is useful for SSD
+			devices and sparse/thinly-provisioned LUNs.

 NILFS2 usage
 ============
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now.
 New super_block field "struct export_operations *s_export_op" for
 explicit support for exporting, e.g. via NFS.  The structure is fully
 documented at its declaration in include/linux/fs.h, and in
-Documentation/filesystems/Exporting.
+Documentation/filesystems/nfs/Exporting.

 Briefly it allows for the definition of decode_fh and encode_fh operations
 to encode and decode filehandles, and allows the filesystem to use
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@ -164,6 +164,7 @@ read the file /proc/PID/status:
  VmExe:        68 kB
  VmLib:      1412 kB
  VmPTE:        20 kb
+  VmSwap:        0 kB
  Threads:        1
  SigQ:   0/28578
  SigPnd: 0000000000000000
@ -177,7 +178,6 @@ read the file /proc/PID/status:
  CapBnd: ffffffffffffffff
  voluntary_ctxt_switches:        0
  nonvoluntary_ctxt_switches:     1
-  Stack usage:    12 kB

 This shows you nearly the same information you would get if you viewed it with
 the ps  command.  In  fact,  ps  uses  the  proc  file  system  to  obtain its
@ -189,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3.  The stat file
 contains details information about the process itself.  Its fields are
 explained in Table 1-4.

-Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information are handled in
+asynchronous manner and the vaule may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
+
+Table 1-2: Contents of the status files (as of 2.6.30-rc7)
 ..............................................................................
 Field                       Content
 Name                        filename of the executable
@ -214,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
+ VmSwap                      size of swap usage (the number of referred swapents)
 Threads                     number of threads
 SigQ                        number of signals queued/max. number for queue
 SigPnd                      bitmap of pending signals for the thread
@ -231,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 Mems_allowed_list           Same as previous, but in "list format"
 voluntary_ctxt_switches     number of voluntary context switches
 nonvoluntary_ctxt_switches  number of non voluntary context switches
- Stack usage:                stack usage high water mark (round up to page size)
 ..............................................................................

 Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
@ -432,6 +438,7 @@ Table 1-5: Kernel info in /proc
 modules     List of loaded modules                            
 mounts      Mounted filesystems                               
 net         Networking info (see text)                        
+ pagetypeinfo Additional page allocator information (see text)  (2.5)
 partitions  Table of partitions known to the system           
 pci	     Deprecated info of PCI bus (new way -> /proc/bus/pci/,
             decoupled by lspci					(2.4)
@ -586,7 +593,7 @@ Node 0, zone      DMA      0      4      5      4      4      3 ...
 Node 0, zone   Normal      1      0      0      1    101      8 ...
 Node 0, zone  HighMem      2      0      0      1      1      0 ...

-Memory fragmentation is a problem under some workloads, and buddyinfo is a 
+External fragmentation is a problem under some workloads, and buddyinfo is a
 useful tool for helping diagnose these problems.  Buddyinfo will give you a 
 clue as to how big an area you can safely allocate, or why a previous
 allocation failed.
@ -596,6 +603,48 @@ available.  In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
 ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 
 available in ZONE_NORMAL, etc... 

+More information relevant to external fragmentation can be found in
+pagetypeinfo.
+
+> cat /proc/pagetypeinfo
+Page block order: 9
+Pages per block:  512
+
+Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
+Node    0, zone      DMA, type    Unmovable      0      0      0      1      1      1      1      1      1      1      0
+Node    0, zone      DMA, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone      DMA, type      Movable      1      1      2      1      2      1      1      0      1      0      2
+Node    0, zone      DMA, type      Reserve      0      0      0      0      0      0      0      0      0      1      0
+Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+Node    0, zone    DMA32, type    Unmovable    103     54     77      1      1      1     11      8      7      1      9
+Node    0, zone    DMA32, type  Reclaimable      0      0      2      1      0      0      0      0      1      0      0
+Node    0, zone    DMA32, type      Movable    169    152    113     91     77     54     39     13      6      1    452
+Node    0, zone    DMA32, type      Reserve      1      2      2      2      2      0      1      1      1      1      0
+Node    0, zone    DMA32, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
+
+Number of blocks type     Unmovable  Reclaimable      Movable      Reserve      Isolate
+Node 0, zone      DMA            2            0            5            1            0
+Node 0, zone    DMA32           41            6          967            2            0
+
+Fragmentation avoidance in the kernel works by grouping pages of different
+migrate types into the same contiguous regions of memory called page blocks.
+A page block is typically the size of the default hugepage size e.g. 2MB on
+X86-64. By keeping pages grouped based on their ability to move, the kernel
+can reclaim pages within a page block to satisfy a high-order allocation.
+
+The pagetypinfo begins with information on the size of a page block. It
+then gives the same type of information as buddyinfo except broken down
+by migrate-type and finishes with details on how many page blocks of each
+type exist.
+
+If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
+from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+make an estimate of the likely number of huge pages that can be allocated
+at a given point in time. All the "Movable" blocks should be allocatable
+unless memory has been mlock()'d. Some of the Reclaimable blocks should
+also be allocatable although a lot of filesystem metadata may have to be
+reclaimed to achieve this.
+
 ..............................................................................

 meminfo:
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@ -837,6 +837,9 @@ replicas continue to be exactly same.
 	 individual lists does not affect propagation or the way propagation
 	 tree is modified by operations.

+	All vfsmounts in a peer group have the same ->mnt_master.  If it is
+	non-NULL, they form a contiguous (ordered) segment of slave list.
+
 	A example propagation tree looks as shown in the figure below.
 	[ NOTE: Though it looks like a forest, if we consider all the shared
 	mounts as a conceptual entity called 'pnode', it becomes a tree]
@ -874,8 +877,19 @@ replicas continue to be exactly same.

 	NOTE: The propagation tree is orthogonal to the mount tree.

+8B Locking:

-8B Algorithm:
+	->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
+	by namespace_sem (exclusive for modifications, shared for reading).
+
+	Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
+	There are two exceptions: do_add_mount() and clone_mnt().
+	The former modifies a vfsmount that has not been visible in any shared
+	data structures yet.
+	The latter holds namespace_sem and the only references to vfsmount
+	are in lists that can't be traversed without namespace_sem.
+
+8C Algorithm:

 	The crux of the implementation resides in rbind/move operation.

--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@ -91,8 +91,8 @@ struct device_attribute {
 			 const char *buf, size_t count);
 };

-int device_create_file(struct device *, struct device_attribute *);
-void device_remove_file(struct device *, struct device_attribute *);
+int device_create_file(struct device *, const struct device_attribute *);
+void device_remove_file(struct device *, const struct device_attribute *);

 It also defines this helper for defining device attributes: 

@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store);

 Creation/Removal:

-int device_create_file(struct device *device, struct device_attribute * attr);
-void device_remove_file(struct device * dev, struct device_attribute * attr);
+int device_create_file(struct device *dev, const struct device_attribute * attr);
+void device_remove_file(struct device *dev, const struct device_attribute * attr);


 - bus drivers (include/linux/device.h)
@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store)

 Creation/Removal:

-int driver_create_file(struct device_driver *, struct driver_attribute *);
-void driver_remove_file(struct device_driver *, struct driver_attribute *);
+int driver_create_file(struct device_driver *, const struct driver_attribute *);
+void driver_remove_file(struct device_driver *, const struct driver_attribute *);


--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
 all files in that instance (if CONFIG_NUMA is enabled) - which can be
 adjusted on the fly via 'mount -o remount ...'

-mpol=default             prefers to allocate memory from the local node
+mpol=default             use the process allocation policy
+                         (see set_mempolicy(2))
 mpol=prefer:Node         prefers to allocate memory from the given Node
 mpol=bind:NodeList       allocates memory only from nodes in NodeList
 mpol=interleave          prefers to allocate from each node in turn
 mpol=interleave:NodeList allocates from each node of NodeList in turn
+mpol=local		 prefers to allocate memory from the local node

 NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
@ -134,3 +136,5 @@ Author:
   Christoph Rohland <cr@sap.com>, 1.12.01
 Updated:
   Hugh Dickins, 4 June 2007
+Updated:
+   KOSAKI Motohiro, 16 Mar 2010
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
 Also note that it's your responsibility to have stopped using a GPIO
 before you free it.

+Considering in most cases GPIOs are actually configured right after they
+are claimed, three additional calls are defined:
+
+	/* request a single GPIO, with initial configuration specified by
+	 * 'flags', identical to gpio_request() wrt other arguments and
+	 * return value
+	 */
+	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+
+	/* request multiple GPIOs in a single call
+	 */
+	int gpio_request_array(struct gpio *array, size_t num);
+
+	/* release multiple GPIOs in a single call
+	 */
+	void gpio_free_array(struct gpio *array, size_t num);
+
+where 'flags' is currently defined to specify the following properties:
+
+	* GPIOF_DIR_IN		- to configure direction as input
+	* GPIOF_DIR_OUT		- to configure direction as output
+
+	* GPIOF_INIT_LOW	- as output, set initial level to LOW
+	* GPIOF_INIT_HIGH	- as output, set initial level to HIGH
+
+since GPIOF_INIT_* are only valid when configured as output, so group valid
+combinations as:
+
+	* GPIOF_IN		- configure as input
+	* GPIOF_OUT_INIT_LOW	- configured as output, initial level LOW
+	* GPIOF_OUT_INIT_HIGH	- configured as output, initial level HIGH
+
+In the future, these flags can be extended to support more properties such
+as open-drain status.
+
+Further more, to ease the claim/release of multiple GPIOs, 'struct gpio' is
+introduced to encapsulate all three fields as:
+
+	struct gpio {
+		unsigned	gpio;
+		unsigned long	flags;
+		const char	*label;
+	};
+
+A typical example of usage:
+
+	static struct gpio leds_gpios[] = {
+		{ 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */
+		{ 33, GPIOF_OUT_INIT_LOW,  "Green LED" }, /* default to OFF */
+		{ 34, GPIOF_OUT_INIT_LOW,  "Red LED"   }, /* default to OFF */
+		{ 35, GPIOF_OUT_INIT_LOW,  "Blue LED"  }, /* default to OFF */
+		{ ... },
+	};
+
+	err = gpio_request_one(31, GPIOF_IN, "Reset Button");
+	if (err)
+		...
+
+	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
+	if (err)
+		...
+
+	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
+

 GPIOs mapped to IRQs
 --------------------
--- a/Documentation/hwmon/abituguru
+++ b/Documentation/hwmon/abituguru
@ -30,7 +30,7 @@ Supported chips:
 	   bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1
 	   You may also need to specify the fan_sensors option for these boards
 	   fan_sensors=5
-	2) There is a seperate abituguru3 driver for these motherboards,
+	2) There is a separate abituguru3 driver for these motherboards,
 	   the abituguru (without the 3 !) driver will not work on these
 	   motherboards (and visa versa)!

--- a/Documentation/hwmon/adt7411
+++ b/Documentation/hwmon/adt7411
@ -0,0 +1,42 @@
+Kernel driver adt7411
+=====================
+
+Supported chips:
+  * Analog Devices ADT7411
+    Prefix: 'adt7411'
+    Addresses scanned: 0x48, 0x4a, 0x4b
+    Datasheet: Publicly available at the Analog Devices website
+
+Author: Wolfram Sang (based on adt7470 by Darrick J. Wong)
+
+Description
+-----------
+
+This driver implements support for the Analog Devices ADT7411 chip. There may
+be other chips that implement this interface.
+
+The ADT7411 can use an I2C/SMBus compatible 2-wire interface or an
+SPI-compatible 4-wire interface. It provides a 10-bit analog to digital
+converter which measures 1 temperature, vdd and 8 input voltages. It has an
+internal temperature sensor, but an external one can also be connected (one
+loses 2 inputs then). There are high- and low-limit registers for all inputs.
+
+Check the datasheet for details.
+
+sysfs-Interface
+---------------
+
+in0_input	- vdd voltage input
+in[1-8]_input	- analog 1-8 input
+temp1_input	- temperature input
+
+Besides standard interfaces, this driver adds (0 = off, 1 = on):
+
+  adc_ref_vdd	- Use vdd as reference instead of 2.25 V
+  fast_sampling	- Sample at 22.5 kHz instead of 1.4 kHz, but drop filters
+  no_average	- Turn off averaging over 16 samples
+
+Notes
+-----
+
+SPI, external temperature sensor and limit registers are not supported yet.
--- a/Documentation/hwmon/adt7473
+++ b/Documentation/hwmon/adt7473
@ -1,74 +0,0 @@
-Kernel driver adt7473
-======================
-
-Supported chips:
-  * Analog Devices ADT7473
-    Prefix: 'adt7473'
-    Addresses scanned: I2C 0x2C, 0x2D, 0x2E
-    Datasheet: Publicly available at the Analog Devices website
-
-Author: Darrick J. Wong
-
-This driver is depreacted, please use the adt7475 driver instead.
-
-Description
-----------
-
-This driver implements support for the Analog Devices ADT7473 chip family.
-
-The ADT7473 uses the 2-wire interface compatible with the SMBUS 2.0
-specification. Using an analog to digital converter it measures three (3)
-temperatures and two (2) voltages. It has four (4) 16-bit counters for
-measuring fan speed. There are three (3) PWM outputs that can be used
-to control fan speed.
-
-A sophisticated control system for the PWM outputs is designed into the
-ADT7473 that allows fan speed to be adjusted automatically based on any of the
-three temperature sensors. Each PWM output is individually adjustable and
-programmable. Once configured, the ADT7473 will adjust the PWM outputs in
-response to the measured temperatures without further host intervention.
-This feature can also be disabled for manual control of the PWM's.
-
-Each of the measured inputs (voltage, temperature, fan speed) has
-corresponding high/low limit values. The ADT7473 will signal an ALARM if
-any measured value exceeds either limit.
-
-The ADT7473 samples all inputs continuously. The driver will not read
-the registers more often than once every other second. Further,
-configuration data is only read once per minute.
-
-Special Features
----------------
-
-The ADT7473 have a 10-bit ADC and can therefore measure temperatures
-with 0.25 degC resolution. Temperature readings can be configured either
-for twos complement format or "Offset 64" format, wherein 63 is subtracted
-from the raw value to get the temperature value.
-
-The Analog Devices datasheet is very detailed and describes a procedure for
-determining an optimal configuration for the automatic PWM control.
-
-Configuration Notes
-------------------
-
-Besides standard interfaces driver adds the following:
-
-* PWM Control
-
-* pwm#_auto_point1_pwm and temp#_auto_point1_temp and
-* pwm#_auto_point2_pwm and temp#_auto_point2_temp -
-
-point1: Set the pwm speed at a lower temperature bound.
-point2: Set the pwm speed at a higher temperature bound.
-
-The ADT7473 will scale the pwm between the lower and higher pwm speed when
-the temperature is between the two temperature boundaries.  PWM values range
-from 0 (off) to 255 (full speed).  Fan speed will be set to maximum when the
-temperature sensor associated with the PWM control exceeds temp#_max.
-
-Notes
-----
-
-The NVIDIA binary driver presents an ADT7473 chip via an on-card i2c bus.
-Unfortunately, they fail to set the i2c adapter class, so this driver may
-fail to find the chip until the nvidia driver is patched.
--- a/Documentation/hwmon/amc6821
+++ b/Documentation/hwmon/amc6821
@ -0,0 +1,102 @@
+Kernel driver amc6821
+=====================
+
+Supported chips:
+	Texas Instruments AMC6821
+	Prefix: 'amc6821'
+	Addresses scanned: 0x18, 0x19, 0x1a, 0x2c, 0x2d, 0x2e, 0x4c, 0x4d, 0x4e
+	Datasheet: http://focus.ti.com/docs/prod/folders/print/amc6821.html
+
+Authors:
+	Tomaz Mertelj <tomaz.mertelj@guest.arnes.si>
+
+
+Description
+-----------
+
+This driver implements support for the Texas Instruments amc6821 chip.
+The chip has one on-chip and one remote temperature sensor and one pwm fan
+regulator.
+The pwm can be controlled either from software or automatically.
+
+The driver provides the following sensor accesses in sysfs:
+
+temp1_input		ro	on-chip temperature
+temp1_min		rw	"
+temp1_max		rw	"
+temp1_crit	 	rw	"
+temp1_min_alarm		ro	"
+temp1_max_alarm		ro	"
+temp1_crit_alarm	ro	"
+
+temp2_input		ro	remote temperature
+temp2_min		rw	"
+temp2_max		rw	"
+temp2_crit	 	rw	"
+temp2_min_alarm		ro	"
+temp2_max_alarm		ro	"
+temp2_crit_alarm	ro	"
+temp2_fault		ro	"
+
+fan1_input	 	ro	tachometer speed
+fan1_min		rw	"
+fan1_max		rw	"
+fan1_fault	 	ro	"
+fan1_div		rw	Fan divisor can be either 2 or 4.
+
+pwm1			rw	pwm1
+pwm1_enable		rw	regulator mode, 1=open loop, 2=fan controlled
+				by remote temperature, 3=fan controlled by
+				combination of the on-chip temperature and
+				remote-sensor temperature,
+pwm1_auto_channels_temp ro	1 if pwm_enable==2, 3 if pwm_enable==3
+pwm1_auto_point1_pwm	ro	Hardwired to 0, shared for both
+				temperature channels.
+pwm1_auto_point2_pwm	rw	This value is shared for both temperature
+				channels.
+pwm1_auto_point3_pwm	rw	Hardwired to 255, shared for both
+				temperature channels.
+
+temp1_auto_point1_temp	ro	Hardwired to temp2_auto_point1_temp
+				which is rw. Below this temperature fan stops.
+temp1_auto_point2_temp	rw	The low-temperature limit of the proportional
+				range. Below this temperature
+				pwm1 = pwm1_auto_point2_pwm. It can go from
+				0 degree C to 124 degree C in steps of
+				4 degree C. Read it out after writing to get
+				the actual value.
+temp1_auto_point3_temp	rw	Above this temperature fan runs at maximum
+				speed. It can go from temp1_auto_point2_temp.
+				It can only have certain discrete values
+				which depend on temp1_auto_point2_temp and
+				pwm1_auto_point2_pwm. Read it out after
+				writing to get the actual value.
+
+temp2_auto_point1_temp	rw	Must be between 0 degree C and 63 degree C and
+				it defines the passive cooling temperature.
+				Below this temperature the fan stops in
+				the closed loop mode.
+temp2_auto_point2_temp	rw	The low-temperature limit of the proportional
+				range. Below this temperature
+				pwm1 = pwm1_auto_point2_pwm. It can go from
+				0 degree C to 124 degree C in steps
+				of 4 degree C.
+
+temp2_auto_point3_temp	rw	Above this temperature fan runs at maximum
+				speed. It can only have certain discrete
+				values which depend on temp2_auto_point2_temp
+				and pwm1_auto_point2_pwm. Read it out after
+				writing to get actual value.
+
+
+Module parameters
+-----------------
+
+If your board has a BIOS that initializes the amc6821 correctly, you should
+load the module with: init=0.
+
+If your board BIOS doesn't initialize the chip, or you want
+different settings, you can set the following parameters:
+init=1,
+pwminv: 0 default pwm output, 1 inverts pwm output.
+
--- a/Show more
+++ b/Show more